Permalink
Browse files
Move Utf8Encode to libstr, in preparation for decoding.
- Loading branch information...
Showing
with
52 additions
and
48 deletions.
-
+34
−0
core/libstr.py
-
+15
−0
core/libstr_test.py
-
+2
−34
core/word_compile.py
-
+1
−14
core/word_compile_test.py
|
|
@@ -22,6 +22,40 @@ |
|
|
log = util.log
|
|
|
e_die = util.e_die
|
|
|
|
|
|
|
|
|
def Utf8Encode(code):
  """Encode a single Unicode code point as a UTF-8 byte string.

  Args:
    code: integer code point.

  Returns:
    A string of one to four bytes.  Values at or below 0x7F yield a single
    byte; values above 0x10FFFF yield the encoding of U+FFFD, the Unicode
    replacement character.

  Based on https://stackoverflow.com/a/23502707
  """
  if code <= 0x7F:
    return chr(code & 0x7F)  # ASCII

  # (largest code point, number of continuation bytes), smallest range first.
  ranges = (
      (0x7FF, 1),
      (0xFFFF, 2),
      (0x10FFFF, 3),
  )
  for upper_bound, num_cont_bytes in ranges:
    if code <= upper_bound:
      break
  else:
    return '\xEF\xBF\xBD'  # unicode replacement character

  # Lead byte: (num_cont_bytes + 1) one-bits, a zero bit, then the highest
  # payload bits of the code point.
  payload_mask = 0x3F >> num_cont_bytes
  lead = (0x1E << (6 - num_cont_bytes)) | (
      (code >> (6 * num_cont_bytes)) & payload_mask)
  encoded = [lead]

  # Continuation bytes (10xxxxxx), most significant 6-bit group first.
  shift = 6 * (num_cont_bytes - 1)
  while shift >= 0:
    encoded.append(0x80 | ((code >> shift) & 0x3F))
    shift -= 6

  # Keep only the low 8 bits: the shifted 0x1E prefix spills past bit 7 and
  # Python ints don't wrap around.
  return ''.join(chr(b & 0xFF) for b in encoded)
|
|
|
|
|
|
|
|
|
# Implementation without Python regex:
|
|
|
#
|
|
|
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
|
|
|
|
|
|
@@ -11,6 +11,21 @@ |
|
|
|
|
|
class LibStrTest(unittest.TestCase):
  """Unit tests for the libstr module."""

  def testUtf8Encode(self):
    """Check libstr.Utf8Encode() against known encodings."""
    # Each entry is (expected encoded string, code point).
    table = (
        (u'\u0065'.encode('utf-8'), 0x0065),
        (u'\u0100'.encode('utf-8'), 0x0100),
        (u'\u1234'.encode('utf-8'), 0x1234),
        (u'\U00020000'.encode('utf-8'), 0x00020000),
        # Out of range gives Unicode replacement character.
        ('\xef\xbf\xbd', 0x10020000),
    )

    for want, cp in table:
      print('')
      print('Utf8Encode case %r %r' % (want, cp))
      actual = libstr.Utf8Encode(cp)
      self.assertEqual(want, actual)

  def testUnarySuffixOpDemo(self):
    """Placeholder test: only prints the libstr module object."""
    print(libstr)
|
|
|
|
|
|
|
|
|
@@ -7,6 +7,7 @@ |
|
|
"""
|
|
|
|
|
|
from core import util
|
|
|
from core import libstr
|
|
|
|
|
|
from osh.meta import Id
|
|
|
from osh.meta import runtime
|
|
|
@@ -31,39 +32,6 @@ |
|
|
}
|
|
|
|
|
|
|
|
|
def Utf8Encode(code):
  """Return the UTF-8 encoding of one Unicode code point.

  Args:
    code: integer code point.

  Returns:
    A string of one to four bytes.  Code points above 0x10FFFF are replaced
    by the encoding of U+FFFD (the Unicode replacement character).

  Based on https://stackoverflow.com/a/23502707
  """
  # Single-byte case: plain ASCII.
  if code <= 0x7F:
    return chr(code & 0x7F)

  # Number of trailing 10xxxxxx bytes needed for this code point.
  if code <= 0x7FF:
    num_cont_bytes = 1
  elif code <= 0xFFFF:
    num_cont_bytes = 2
  elif code <= 0x10FFFF:
    num_cont_bytes = 3
  else:
    return '\xEF\xBF\xBD'  # unicode replacement character

  # Peel off 6 bits at a time, least significant group first.
  low_groups = []
  remaining = code
  while len(low_groups) < num_cont_bytes:
    low_groups.append(0x80 | (remaining & 0x3F))
    remaining >>= 6

  # Whatever is left goes into the lead byte, under a prefix of
  # (num_cont_bytes + 1) one-bits followed by a zero bit.
  lead = (0x1E << (6 - num_cont_bytes)) | (remaining & (0x3F >> num_cont_bytes))
  ordered = [lead] + low_groups[::-1]

  # Mask to 8 bits because Python ints don't wrap around.
  return ''.join([chr(value & 0xFF) for value in ordered])
|
|
|
|
|
|
|
|
|
# TODO: Strict mode syntax errors:
|
|
|
#
|
|
|
# \x is a syntax error -- needs two digits (It's like this in C)
|
|
|
@@ -114,7 +82,7 @@ def EvalCStringToken(id_, value): |
|
|
s = value[2:]
|
|
|
i = int(s, 16)
|
|
|
#util.log('i = %d', i)
|
|
|
return Utf8Encode(i)
|
|
|
return libstr.Utf8Encode(i)
|
|
|
|
|
|
else:
|
|
|
raise AssertionError
|
|
|
|
|
|
@@ -9,21 +9,8 @@ |
|
|
|
|
|
|
|
|
class WordCompileTest(unittest.TestCase):
  """Unit tests for the word_compile module."""

  def testUtf8Encode(self):
    """Check word_compile.Utf8Encode() against known encodings."""
    # Each entry is (expected encoded string, code point).
    table = (
        (u'\u0065'.encode('utf-8'), 0x0065),
        (u'\u0100'.encode('utf-8'), 0x0100),
        (u'\u1234'.encode('utf-8'), 0x1234),
        (u'\U00020000'.encode('utf-8'), 0x00020000),
        # Out of range gives Unicode replacement character.
        ('\xef\xbf\xbd', 0x10020000),
    )

    for want, cp in table:
      print('')
      print('Utf8Encode case %r %r' % (want, cp))
      actual = word_compile.Utf8Encode(cp)
      self.assertEqual(want, actual)
|
|
|
|
|
|
# Run the test cases only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
  unittest.main()
|
0 comments on commit
5441874