Permalink
Browse files

Got basic UTF-8 aware string slicing working, but with duplicate code.

- Didn't implement negative start indices.
- Discovered another semantic in bash:
  - If the second slice arg is negative, it's a position, not a length!
    Geez.
  - Does that work for arrays too?
- Start running var-op-other tests on ZSH.  All are green or yellow now.

Addresses issue #148.
  • Loading branch information...
Andy Chu
Andy Chu committed Aug 16, 2018
1 parent 32222b2 commit 0f719deba2f97f27778f5e6e8fb0cce115e435eb
Showing with 120 additions and 25 deletions.
  1. +29 −0 core/libstr.py
  2. +20 −10 core/word_eval.py
  3. +70 −14 spec/var-op-other.test.sh
  4. +1 −1 test/spec.sh
View
@@ -111,6 +111,35 @@ def NumOfUtf8Chars(bytes):
return num_of_utf8_chars
def AdvanceChars(s, num_chars, byte_offset):
  """
  Advance a certain number of UTF-8 chars, beginning with the given byte
  offset.  Returns a byte offset.

  Args:
    s: The string to scan.  Each element is passed to ord() to obtain the
       byte value that is inspected.
    num_chars: How many characters to skip.  Zero or a negative count skips
       nothing.
    byte_offset: Offset into s where scanning starts.

  Returns:
    The byte offset reached after skipping up to num_chars characters.  If
    the end of the string is reached first, scanning stops there instead of
    failing, so an out-of-range slice begin or length is clamped rather than
    being an error.

  Raises:
    AssertionError: If the byte at the current offset does not match any of
      the 1-, 2-, 3- or 4-byte lead patterns tested below.
    Whatever _CheckContinuationByte raises for a trailing byte it rejects.
  """
  i = byte_offset  # mutated
  num_bytes = len(s)
  remaining = num_chars

  # The 'i < num_bytes' test is the bounds check: without it, asking for more
  # characters than the string holds would index past the end.
  while remaining > 0 and i < num_bytes:
    byte_as_int = ord(s[i])

    # The high bits of the lead byte give the length of the encoded character.
    # NOTE(review): a multi-byte character truncated by the end of the string
    # still indexes past the end in the s[i+N] lookups below -- confirm how
    # callers want that reported.
    if (byte_as_int >> 7) == 0b0:
      i += 1
    elif (byte_as_int >> 5) == 0b110:
      _CheckContinuationByte(s[i+1])
      i += 2
    elif (byte_as_int >> 4) == 0b1110:
      _CheckContinuationByte(s[i+1])
      _CheckContinuationByte(s[i+2])
      i += 3
    elif (byte_as_int >> 3) == 0b11110:
      _CheckContinuationByte(s[i+1])
      _CheckContinuationByte(s[i+2])
      _CheckContinuationByte(s[i+3])
      i += 4
    else:
      raise AssertionError

    remaining -= 1

  return i
# Implementation without Python regex:
#
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
View
@@ -309,6 +309,7 @@ def _ApplyPrefixOp(self, val, op_id):
# There can be empty placeholder values in the array.
length = sum(1 for s in val.strs if s is not None)
return runtime.Str(str(length))
elif op_id == Id.VSub_Bang:
# NOTES:
# - Could translate to eval('$' + name) or eval("\$$name")
@@ -319,6 +320,7 @@ def _ApplyPrefixOp(self, val, op_id):
# Treat the value of the variable as a variable name.
return self.mem.GetVar(val.s)
else:
raise AssertionError(op_id)
@@ -575,32 +577,40 @@ def _EvalBracedVarSub(self, part, part_vals, quoted):
raise AssertionError(val.__class__.__name__)
elif op.tag == suffix_op_e.Slice:
# NOTE: The beginning can be negative, but Python handles this. Might
# want to make it explicit.
# TODO: Check out of bounds errors? begin > end?
# TODO: Check out of bounds errors? begin could be past the beginning.
if op.begin:
begin = self.arith_ev.Eval(op.begin)
else:
begin = 0
if op.length:
length = self.arith_ev.Eval(op.length)
end = begin + length
else:
length = None
end = None # Python supports None as the end
if val.tag == value_e.Str: # Slice characters in a string.
# TODO: Need to support unicode? Write spec # tests.
val = runtime.Str(val.s[begin : end])
if val.tag == value_e.Str: # Slice UTF-8 characters in a string.
s = val.s
if begin >= 0:
byte_begin = libstr.AdvanceChars(s, begin, 0)
else:
# How do we count characters from the end? I guess we have to
# decode the whole thing.
raise NotImplementedError
if length is not None:
byte_end = libstr.AdvanceChars(s, length, byte_begin)
else:
byte_end = len(s)
val = runtime.Str(s[byte_begin : byte_end])
elif val.tag == value_e.StrArray: # Slice array entries.
# NOTE: unset elements don't count towards the length
# NOTE: unset elements don't count towards the length.
strs = []
for s in val.strs[begin:]:
if s is not None:
strs.append(s)
if len(strs) == length: # never true for unspecified length
if len(strs) == length: # never true for unspecified length
break
val = runtime.StrArray(strs)
View
@@ -44,7 +44,8 @@ error: Incomplete utf-8
error: Incomplete utf-8
7
## END
## BUG bash STDOUT:
# zsh behavior actually matches bash!
## BUG bash/zsh STDOUT:
0
1
2
@@ -99,7 +100,7 @@ error: Invalid utf-8 continuation byte
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
## END
## BUG bash STDOUT:
## BUG bash/zsh STDOUT:
1
2
3
@@ -168,6 +169,9 @@ echo ${#v:1:3}
## OK osh status: 2
## N-I dash status: 0
## N-I dash stdout: 5
# zsh actually implements this!
## OK zsh stdout: 3
## OK zsh status: 0
#### Pattern replacement
v=abcde
@@ -177,18 +181,18 @@ echo ${v/c*/XX}
## N-I dash stdout-json: ""
#### Pattern replacement on unset variable
echo [${v/x/y}]
echo -${v/x/y}-
echo status=$?
set -o nounset # make sure this fails
echo [${v/x/y}]
echo -${v/x/y}-
## STDOUT:
[]
--
status=0
## BUG mksh STDOUT:
# patsub disrespects nounset!
[]
--
status=0
[]
--
## status: 1
## BUG mksh status: 0
## N-I dash status: 2
@@ -253,8 +257,8 @@ echo status=$?
## stdout-json: ""
## N-I dash status: 2
## N-I dash stdout-json: ""
## BUG bash/mksh status: 0
## BUG bash/mksh stdout-json: "-abcde-\nstatus=0\n"
## BUG bash/mksh/zsh status: 0
## BUG bash/mksh/zsh stdout-json: "-abcde-\nstatus=0\n"
#### Pattern replacement ${v//} is not valid
v='a/b/c'
@@ -264,8 +268,8 @@ echo status=$?
## stdout-json: ""
## N-I dash status: 2
## N-I dash stdout-json: ""
## BUG bash/mksh status: 0
## BUG bash/mksh stdout-json: "-a/b/c-\nstatus=0\n"
## BUG bash/mksh/zsh status: 0
## BUG bash/mksh/zsh stdout-json: "-a/b/c-\nstatus=0\n"
#### ${v/a} is the same as ${v/a/} -- no replacement string
v='aabb'
@@ -278,17 +282,54 @@ echo status=$?
#### String slice
foo=abcdefg
echo ${foo:1:3}
## stdout: bcd
## STDOUT:
bcd
## END
## N-I dash status: 2
## N-I dash stdout-json: ""
#### Out of range string slice: begin
# out of range begin doesn't raise error in bash, but in mksh it skips the
# whole thing!
foo=abcdefg
echo _${foo:100:3}
echo $?
## STDOUT:
_
0
## END
## BUG mksh stdout-json: "\n0\n"
## N-I dash status: 2
## N-I dash stdout-json: ""
#### Negative string slice
#### Out of range string slice: length
# OK in both bash and mksh
foo=abcdefg
echo _${foo:3:100}
echo $?
## STDOUT:
_defg
0
## END
## BUG mksh stdout-json: "_defg\n0\n"
## N-I dash status: 2
## N-I dash stdout-json: ""
#### String slice: negative begin
foo=abcdefg
echo ${foo: -4:3}
## stdout: def
## N-I dash status: 2
## N-I dash stdout-json: ""
#### String slice: negative second arg is position, not length
foo=abcdefg
echo ${foo:3:-1} ${foo: 3: -2} ${foo:3 :-3 }
## stdout: def de d
## BUG mksh stdout: defg defg defg
## N-I dash status: 2
## N-I dash stdout-json: ""
#### String slice with math
# I think this is the $(()) language inside?
i=1
@@ -298,11 +339,26 @@ echo ${foo: i-3-2 : i + 2}
## N-I dash status: 2
## N-I dash stdout-json: ""
#### Slice String with Unicode
#### Slice UTF-8 String
# mksh slices by bytes.
foo='--μ--'
echo ${foo:1:3}
## stdout: -μ-
## BUG mksh stdout: -μ
## N-I dash status: 2
## N-I dash stdout-json: ""
#### Slice UTF-8 string with invalid data
# mksh slices by bytes.
s=$(echo -e "\xFF")bcdef
echo ${s:1:3}
## status: 1
## stdout-json: ""
## stderr-json: "error: Invalid start of utf-8 char"
## BUG bash/mksh/zsh status: 0
## BUG bash/mksh/zsh stdout-json: "bcd\n"
## BUG bash/mksh/zsh stderr-json: ""
## N-I dash status: 2
## N-I dash stdout-json: ""
## N-I dash stderr-json: "_tmp/spec-bin/dash: 2: Bad substitution\n"
View
@@ -454,7 +454,7 @@ var-op-test() {
var-op-other() {
sh-spec spec/var-op-other.test.sh --osh-failures-allowed 1 \
${REF_SHELLS[@]} $OSH "$@"
${REF_SHELLS[@]} $ZSH $OSH "$@"
}
var-op-strip() {

0 comments on commit 0f719de

Please sign in to comment.