Permalink
Browse files

Refactor: use a variable length array of bytes_

  • Loading branch information...
Andy Chu
Andy Chu committed Jun 5, 2018
1 parent b954b69 commit 8e772f972960556308e5ae8430426f5578b22ae1
Showing with 56 additions and 24 deletions.
  1. +26 −24 core/word_compile.py
  2. +30 −0 core/word_compile_test.py
View
@@ -39,40 +39,42 @@ def Utf8Encode(code):
utf-8 encoded str
"""
#print('Utf8Encode code %r' % code)
chars = [0] * 4
if code <= 0x7F:
chars[0] = code & 0x7F
count = 0
bytes_ = [code & 0x7F]
# chars[0] = code & 0x7F
# count = 0
elif code > 0x10FFFF:
# unicode replacement character
chars[0] = 0xEF
chars[1] = 0xBF
chars[2] = 0xBD
chars[3] = 0
count = 2
bytes_ = [0xEF, 0xBF, 0xBD]
# chars[0] = 0xEF
# chars[1] = 0xBF
# chars[2] = 0xBD
# chars[3] = 0
# count = 2
else:
if code <= 0x7FF:
print('==== c=1')
# one continuation byte
count = 1
num_continuation_bytes = 1
elif code <= 0xFFFF:
# two continuation bytes
count = 2
num_continuation_bytes = 2
else:
# three continuation bytes
count = 3
for i in xrange(count):
chars[count-i] = 0x80 | (code & 0x3F)
num_continuation_bytes = 3
bytes_ = []
for i in xrange(num_continuation_bytes):
bytes_.append(0x80 | (code & 0x3F))
#bytes_[count-i] = 0x80 | (code & 0x3F)
code >>= 6
chars[0] = (0x1E << (6-count)) | (code & (0x3F >> count))
bytes_.append((0x1E << (6-num_continuation_bytes)) | (code & (0x3F >> num_continuation_bytes)))
bytes_.reverse()
#chars[1+count] = 0
print('chars %r' % chars)
s = ''
for i in xrange(count+1):
print('i = %d' % chars[i])
s += chr(chars[i] % 256)
return s
# print('chars %r' % chars)
return "".join(chr(b % 256) for b in bytes_)
# s = ''
# for i in xrange(count+1):
# print('i = %d' % chars[i])
# s += chr(chars[i] % 256)
# return s
#return unichr(c).encode('utf-8')
View
@@ -0,0 +1,30 @@
#!/usr/bin/python -S
"""
word_compile_test.py: Tests for word_compile.py
"""
import unittest
from core import word_compile # module under test
class WordCompileTest(unittest.TestCase):
  """Tests for word_compile.Utf8Encode."""

  def testUtf8Encode(self):
    """Utf8Encode(code_point) returns the UTF-8 encoded byte string.

    Each case is an (expected, code_point) pair.  Expected values are produced
    by the interpreter's own UTF-8 codec so the hand-rolled encoder is checked
    against a reference implementation.
    """
    CASES = [
        # One case per encoded length (1 to 4 bytes).
        (u'\u0065'.encode('utf-8'), 0x0065),
        (u'\u0100'.encode('utf-8'), 0x0100),
        (u'\u1234'.encode('utf-8'), 0x1234),
        (u'\U00020000'.encode('utf-8'), 0x00020000),

        # 0x100200 is still <= 0x10FFFF, so it is a valid code point and must
        # be encoded as a normal 4-byte sequence, not replaced.
        (u'\U00100200'.encode('utf-8'), 0x100200),

        # Largest valid code point: last value that is encoded as-is.
        (u'\U0010ffff'.encode('utf-8'), 0x10FFFF),

        # Out of range (> 0x10FFFF) gives the Unicode replacement character
        # U+FFFD, i.e. the bytes EF BF BD.
        (u'\ufffd'.encode('utf-8'), 0x110000),
    ]
    for expected, code_point in CASES:
      # Include the code point in the failure message so a failing case is
      # identifiable without debug prints.
      self.assertEqual(
          expected, word_compile.Utf8Encode(code_point),
          'Utf8Encode(0x%X) should be %r' % (code_point, expected))


if __name__ == '__main__':
  unittest.main()

0 comments on commit 8e772f9

Please sign in to comment.