Permalink
Browse files

Style changes to UTF-8 code.

- Factor out duplicate code from length op and string slicing.
- Raise the InvalidUtf8 exception.
- Rename functions.
  • Loading branch information...
Andy Chu
Andy Chu committed Aug 17, 2018
1 parent c08e45d commit cad1f8eacab259912910abc063d10eb7bb8b21ef
Showing with 60 additions and 55 deletions.
  1. +51 −52 core/libstr.py
  2. +9 −3 core/word_eval.py
View
@@ -56,17 +56,52 @@ def Utf8Encode(code):
return ''.join(chr(b & 0xFF) for b in bytes_)
# Error messages describing why a byte string failed UTF-8 validation.
# NOTE(review): INCOMPLETE and INCOMPLETE_CHAR hold identical text; INCOMPLETE
# looks like the pre-rename name -- confirm nothing else still uses it.
INCOMPLETE = 'error: Incomplete utf-8'
INCOMPLETE_CHAR = 'error: Incomplete utf-8'
INVALID_CONT = 'error: Invalid utf-8 continuation byte'
INVALID_START = 'error: Invalid start of utf-8 char'
class InvalidUtf8(Exception):
    """Raised when a byte string is not valid UTF-8.

    Attributes:
      msg: human-readable description of what was wrong with the input.
    """

    def __init__(self, msg):
        # Forward msg to Exception so that str(e) and e.args carry it too
        # (they were empty before).  The .msg attribute is kept because
        # callers read it directly.
        Exception.__init__(self, msg)
        self.msg = msg
def _CheckContinuationByte(byte):
if (ord(byte) >> 6) != 0b10:
raise RuntimeError
raise InvalidUtf8(INVALID_CONT)
def _NextUtf8Char(s, i):
    """Return the byte offset just past the UTF-8 character starting at s[i].

    The lead byte determines how many bytes the character occupies (1 to 4);
    every following byte is validated as a continuation byte.

    Args:
      s: byte string being scanned.
      i: byte offset of the character's first byte.  The caller must pass an
        offset inside 's'; s[i] itself is read outside the error handling.

    Returns:
      The byte offset of the next character.

    Raises:
      InvalidUtf8: INVALID_START if s[i] is not a valid lead byte,
        INVALID_CONT (via _CheckContinuationByte) for a bad continuation
        byte, or INCOMPLETE_CHAR if the string ends in the middle of the
        character.
    """
    lead = ord(s[i])  # Should never raise IndexError

    # (right shift, expected high bits, total bytes in the character)
    lead_patterns = (
        (7, 0b0, 1),
        (5, 0b110, 2),
        (4, 0b1110, 3),
        (3, 0b11110, 4),
    )
    for shift, high_bits, width in lead_patterns:
        if (lead >> shift) == high_bits:
            break
    else:
        raise InvalidUtf8(INVALID_START)

    try:
        # Check the continuation bytes in order, so the first problem found
        # (bad byte or premature end) is the one reported.
        for offset in range(1, width):
            _CheckContinuationByte(s[i + offset])
    except IndexError:
        raise InvalidUtf8(INCOMPLETE_CHAR)

    return i + width
def NumOfUtf8Chars(bytes):
def CountUtf8Chars(s):
"""Returns the number of utf-8 characters in the byte string 's'.
TODO: Raise exception rather than returning a string, so we can set the exit
@@ -77,65 +112,29 @@ def NumOfUtf8Chars(bytes):
$ echo $?
1
"""
num_of_utf8_chars = 0
num_bytes = len(bytes)
num_chars = 0
num_bytes = len(s)
i = 0
while i < num_bytes:
byte_as_int = ord(bytes[i])
try:
if (byte_as_int >> 7) == 0b0:
i += 1
elif (byte_as_int >> 5) == 0b110:
_CheckContinuationByte(bytes[i+1])
i += 2
elif (byte_as_int >> 4) == 0b1110:
_CheckContinuationByte(bytes[i+1])
_CheckContinuationByte(bytes[i+2])
i += 3
elif (byte_as_int >> 3) == 0b11110:
_CheckContinuationByte(bytes[i+1])
_CheckContinuationByte(bytes[i+2])
_CheckContinuationByte(bytes[i+3])
i += 4
else:
return INVALID_START
except IndexError:
return INCOMPLETE
except RuntimeError:
return INVALID_CONT
i = _NextUtf8Char(s, i)
num_chars += 1
return num_chars
num_of_utf8_chars += 1
return num_of_utf8_chars
def AdvanceUtf8Chars(s, num_chars, byte_offset):
    """Advance a number of UTF-8 characters, starting at a byte offset.

    Args:
      s: byte string to walk.
      num_chars: how many characters to advance.
      byte_offset: byte position in 's' to start from.

    Returns:
      The byte offset reached after advancing num_chars characters.

    Raises:
      RuntimeError: if the end of the string is reached before num_chars
        characters have been consumed.
      InvalidUtf8: propagated from _NextUtf8Char when the bytes being skipped
        are not valid UTF-8.
    """
    num_bytes = len(s)
    i = byte_offset  # mutated

    # All decoding is delegated to _NextUtf8Char so that validation happens in
    # exactly one place; this loop only guards against running off the end.
    for _ in xrange(num_chars):
        if i >= num_bytes:
            raise RuntimeError('Out of bounds')
        i = _NextUtf8Char(s, i)
    return i
View
@@ -304,10 +304,16 @@ def _ApplyPrefixOp(self, val, op_id):
# count-bytes?
# https://stackoverflow.com/questions/17368067/length-of-string-in-bash
length = libstr.NumOfUtf8Chars(val.s)
try:
length = libstr.CountUtf8Chars(val.s)
except libstr.InvalidUtf8 as e:
# EARLY RETURN. TODO: Should print to stderr!
return runtime.Str(str(e.msg))
elif val.tag == value_e.StrArray:
# There can be empty placeholder values in the array.
length = sum(1 for s in val.strs if s is not None)
return runtime.Str(str(length))
elif op_id == Id.VSub_Bang:
@@ -591,14 +597,14 @@ def _EvalBracedVarSub(self, part, part_vals, quoted):
if val.tag == value_e.Str: # Slice UTF-8 characters in a string.
s = val.s
if begin >= 0:
byte_begin = libstr.AdvanceChars(s, begin, 0)
byte_begin = libstr.AdvanceUtf8Chars(s, begin, 0)
else:
# How do we count characters from the end? I guess we have to
# decode the whole thing.
raise NotImplementedError
if length is not None:
byte_end = libstr.AdvanceChars(s, length, byte_begin)
byte_end = libstr.AdvanceUtf8Chars(s, length, byte_begin)
else:
byte_end = len(s)

0 comments on commit cad1f8e

Please sign in to comment.