Permalink
Browse files

Calculate length of unicode string without depending on Python's implementation (#152)
  • Loading branch information...
contrepoint authored and andychu committed Jun 25, 2018
1 parent f1fea78 commit 916d440646401e9cdaddbfa506ef71fd772a68cf
Showing with 182 additions and 3 deletions.
  1. +46 −0 core/libstr.py
  2. +1 −3 core/word_eval.py
  3. +1 −0 spec/testdata/utf8-chars.txt
  4. +134 −0 spec/var-op-other.test.sh
View
@@ -56,6 +56,52 @@ def Utf8Encode(code):
return ''.join(chr(b & 0xFF) for b in bytes_)
INCOMPLETE = 'error: Incomplete utf-8'
INVALID_CONT = 'error: Invalid utf-8 continuation byte'
INVALID_START = 'error: Invalid start of utf-8 char'

# Inclusive (low, high) bounds for the byte that directly follows certain lead
# bytes.  Every other continuation byte must lie in 0x80..0xBF.  The narrower
# ranges reject sequences that have the right bit pattern but are still not
# well-formed UTF-8 (RFC 3629, section 4).
_SECOND_BYTE_BOUNDS = {
    0xE0: (0xA0, 0xBF),  # below 0xA0 is an overlong form of U+0000..U+07FF
    0xED: (0x80, 0x9F),  # above 0x9F would encode a UTF-16 surrogate
    0xF0: (0x90, 0xBF),  # below 0x90 is an overlong form of U+0000..U+FFFF
    0xF4: (0x80, 0x8F),  # above 0x8F would be beyond U+10FFFF
}


def _CheckContinuationByte(byte):
  """Raise RuntimeError unless 'byte' has the bit pattern 0b10xxxxxx.

  NumOfUtf8Chars below does its own range checks; this helper is retained so
  that any other code referring to it keeps working.

  Args:
    byte: A single-element byte string, as accepted by ord().

  Raises:
    RuntimeError: If the two high bits of the byte are not '10'.
  """
  if (ord(byte) >> 6) != 0b10:
    raise RuntimeError


def NumOfUtf8Chars(bytes):
  """Count the UTF-8 encoded characters in a byte string.

  The count is computed by walking the bytes directly rather than by decoding
  them with the interpreter's own codec.

  Args:
    bytes: The byte string to measure.  (The name shadows the builtin, but it
      is part of the signature and is therefore left alone.)

  Returns:
    The number of characters, as an int, when 'bytes' is well-formed UTF-8.
    Otherwise one of the module-level error strings:
      INVALID_START if a character begins with a byte that can never start a
        well-formed sequence (a stray continuation byte, 0xC0, 0xC1, or
        anything from 0xF5 upward).
      INCOMPLETE if the string ends in the middle of a character.
      INVALID_CONT if a byte after the lead byte is outside the range allowed
        at that position, which also covers overlong forms, surrogates and
        values above U+10FFFF.
    The bytes of a character are examined left to right and the first problem
    found is the one reported.
  """
  num_chars = 0
  num_bytes = len(bytes)
  i = 0
  while i < num_bytes:
    lead = ord(bytes[i])
    if lead < 0x80:
      seq_len = 1
    elif 0xC2 <= lead <= 0xDF:
      # 0xC0 and 0xC1 could only produce overlong 2-byte forms of ASCII.
      seq_len = 2
    elif 0xE0 <= lead <= 0xEF:
      seq_len = 3
    elif 0xF0 <= lead <= 0xF4:
      # 0xF5..0xF7 would always decode to something above U+10FFFF.
      seq_len = 4
    else:
      return INVALID_START

    # Only the first continuation byte can have a narrowed range; after it has
    # been checked the bounds fall back to the generic 0x80..0xBF.
    low, high = _SECOND_BYTE_BOUNDS.get(lead, (0x80, 0xBF))
    for j in range(i + 1, i + seq_len):
      if j >= num_bytes:
        return INCOMPLETE
      if not low <= ord(bytes[j]) <= high:
        return INVALID_CONT
      low, high = 0x80, 0xBF

    i += seq_len
    num_chars += 1
  return num_chars
# Implementation without Python regex:
#
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
View
@@ -298,9 +298,7 @@ def _ApplyPrefixOp(self, val, op_id):
if op_id == Id.VSub_Pound: # LENGTH
if val.tag == value_e.Str:
unicode_val = val.s.decode('utf-8')
length = len(unicode_val)
# length = len(val.s)
length = libstr.NumOfUtf8Chars(val.s)
elif val.tag == value_e.StrArray:
# There can be empty placeholder values in the array.
length = sum(1 for s in val.strs if s is not None)
@@ -0,0 +1 @@
z λ 三 😘
View
@@ -16,6 +16,140 @@ echo ${#v}
## BUG dash stdout: 9
## BUG mksh stdout: 4
### Unicode string length (spec/testdata/utf8-chars.txt)
# The fixture is 7 characters long but 13 bytes long, so a shell that counts
# bytes rather than characters prints 13.
v=$(cat spec/testdata/utf8-chars.txt)
echo ${#v}
## stdout: 7
## BUG dash stdout: 13
## BUG mksh stdout: 13
### String length with incomplete utf-8
# Measure every prefix of the fixture from 0 up to 13 bytes.  A prefix that
# stops partway through a multi-byte character is expected to print the
# incomplete-sequence error in place of a length.
for num_bytes in 0 1 2 3 4 5 6 7 8 9 10 11 12 13; do
s=$(head -c $num_bytes spec/testdata/utf8-chars.txt)
echo ${#s}
done
## STDOUT:
0
1
2
error: Incomplete utf-8
3
4
error: Incomplete utf-8
error: Incomplete utf-8
5
6
error: Incomplete utf-8
error: Incomplete utf-8
error: Incomplete utf-8
7
## END
## BUG bash STDOUT:
0
1
2
3
3
4
5
6
5
6
7
8
9
7
## END
## BUG dash/mksh STDOUT:
0
1
2
3
4
5
6
7
8
9
10
11
12
13
## END
### String length with invalid utf-8 continuation bytes
# Every prefix of the fixture gets the output of echo -e "\xFF" appended.  In
# shells whose echo understands -e and \x escapes that is a single 0xFF byte.
# After a complete character the 0xFF is reported as a bad start byte, and
# after a truncated character it is reported as a bad continuation byte.
for num_bytes in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14; do
s=$(head -c $num_bytes spec/testdata/utf8-chars.txt)$(echo -e "\xFF")
echo ${#s}
done
## STDOUT:
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid utf-8 continuation byte
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid utf-8 continuation byte
error: Invalid utf-8 continuation byte
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid utf-8 continuation byte
error: Invalid utf-8 continuation byte
error: Invalid utf-8 continuation byte
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
## END
## BUG bash STDOUT:
1
2
3
4
4
5
6
7
6
7
8
9
10
8
8
## END
## BUG dash STDOUT:
7
8
9
10
11
12
13
14
15
16
17
18
19
20
20
## END
## BUG mksh STDOUT:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
14
## END
### Length of undefined variable
echo ${#undef}
# stdout: 0

0 comments on commit 916d440

Please sign in to comment.