Permalink
Browse files

got basic utf-8 encoding working, needs cleanup

  • Loading branch information...
Andy Chu
Andy Chu committed Jun 5, 2018
1 parent 00c299e commit b954b6992f4f81e416cbcca05fc9c5d871775836
Showing with 48 additions and 0 deletions.
  1. +47 −0 core/word_compile.py
  2. +1 −0 native/libc_test.py
View
47 core/word_compile.py 100644 → 100755
@@ -30,6 +30,52 @@
'"': '"', # not sure why this is escaped within $''
}
def Utf8Encode(code):
    """Encode a single Unicode code point as UTF-8.

    For valid, non-surrogate code points the result matches what the
    built-in codec produces (``unichr(code).encode('utf-8')`` on Python 2).
    Surrogate code points (0xD800-0xDFFF) are not special-cased; they are
    encoded with the generic three-byte form.

    Args:
      code: Unicode code point (integer)

    Returns:
      utf-8 encoded byte string (``str`` on Python 2, ``bytes`` on Python 3).
      A code point outside the range 0..0x10FFFF yields the encoding of
      U+FFFD, the Unicode replacement character.
    """
    # Out-of-range input (including negatives, which would otherwise be
    # masked into an arbitrary ASCII byte) maps to U+FFFD.
    if code < 0 or code > 0x10FFFF:
        return b'\xef\xbf\xbd'

    # ASCII is encoded as itself in a single byte.
    if code <= 0x7F:
        return bytes(bytearray([code]))

    # Number of continuation bytes that follow the lead byte.
    if code <= 0x7FF:
        num_cont = 1
    elif code <= 0xFFFF:
        num_cont = 2
    else:
        num_cont = 3

    out = bytearray(num_cont + 1)

    # Fill continuation bytes from last to first; each carries the next
    # lowest 6 bits of the code point under a 10xxxxxx marker.
    for i in range(num_cont, 0, -1):
        out[i] = 0x80 | (code & 0x3F)
        code >>= 6

    # Lead byte: (num_cont + 1) high one-bits, then a zero bit, then whatever
    # payload bits remain.  Shifting 0b11110 left and keeping the low 8 bits
    # yields 0xC0 / 0xE0 / 0xF0 for 1 / 2 / 3 continuation bytes.
    lead_marker = (0x1E << (6 - num_cont)) & 0xFF
    out[0] = lead_marker | (code & (0x3F >> num_cont))

    return bytes(out)
# TODO: Strict mode syntax errors:
#
# \x is a syntax error -- needs two digits (It's like this in C)
@@ -79,6 +125,7 @@ def EvalCStringToken(id_, value):
elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
s = value[2:]
i = int(s, 16)
#util.log('i = %d', i)
return unichr(i).encode('utf-8') # Stay in the realm of bytes
else:
View
@@ -69,6 +69,7 @@ def testRegex(self):
]
for pat, s, expected in cases:
print('CASE %s' % pat)
actual = libc.regex_match(pat, s)
self.assertEqual(expected, actual)

0 comments on commit b954b69

Please sign in to comment.