Permalink
Browse files

Error handling for slices in OSH.

- For string slices, the start index and length can't be negative.
  It's possible to implement negative indices, but the semantics are
  awkward, and I'd like to see which scripts use them.
- 'set -o strict-word-eval' controls whether this is a fatal error or a
  warning + empty slice result.
- Change UTF-8 error handling to use an exception in core/util.py.
- Change UTF-8 error message text.
- Add a test case for strict-word-eval and invalid UTF-8 when slicing
  strings.

Still TODO: Properly handle invalid UTF-8 when taking the string length.

Addresses issue #142.
  • Loading branch information...
Andy Chu
Andy Chu committed Aug 17, 2018
1 parent 81a0851 commit 132cee4e07f632f8070b3c2a56f9330905503e36
Showing with 118 additions and 52 deletions.
  1. +6 −11 core/libstr.py
  2. +12 −0 core/util.py
  3. +37 −12 core/word_eval.py
  4. +4 −0 spec/osh-only.test.sh
  5. +58 −29 spec/var-op-other.test.sh
  6. +1 −0 test/sh_spec.py
View
@@ -56,19 +56,14 @@ def Utf8Encode(code):
return ''.join(chr(b & 0xFF) for b in bytes_)
INCOMPLETE_CHAR = 'error: Incomplete utf-8'
INVALID_CONT = 'error: Invalid utf-8 continuation byte'
INVALID_START = 'error: Invalid start of utf-8 char'
class InvalidUtf8(Exception):
def __init__(self, msg):
self.msg = msg
INCOMPLETE_CHAR = 'Incomplete UTF-8 character'
INVALID_CONT = 'Invalid UTF-8 continuation byte'
INVALID_START = 'Invalid start of UTF-8 character'
def _CheckContinuationByte(byte):
if (ord(byte) >> 6) != 0b10:
raise InvalidUtf8(INVALID_CONT)
raise util.InvalidUtf8(INVALID_CONT)
def _NextUtf8Char(s, i):
@@ -94,9 +89,9 @@ def _NextUtf8Char(s, i):
_CheckContinuationByte(s[i+3])
i += 4
else:
raise InvalidUtf8(INVALID_START)
raise util.InvalidUtf8(INVALID_START)
except IndexError:
raise InvalidUtf8(INCOMPLETE_CHAR)
raise util.InvalidUtf8(INCOMPLETE_CHAR)
return i
View
@@ -82,6 +82,18 @@ class FatalRuntimeError(_ErrorWithLocation):
pass
class InvalidSlice(FatalRuntimeError):
"""Whether this is fatal depends on set -o strict-word-eval.
"""
pass
class InvalidUtf8(FatalRuntimeError):
"""Whether this is fatal depends on set -o strict-word-eval.
"""
pass
class ErrExitFailure(FatalRuntimeError):
"""For set -e.
View
@@ -306,7 +306,7 @@ def _ApplyPrefixOp(self, val, op_id):
# https://stackoverflow.com/questions/17368067/length-of-string-in-bash
try:
length = libstr.CountUtf8Chars(val.s)
except libstr.InvalidUtf8 as e:
except util.InvalidUtf8 as e:
# EARLY RETURN. TODO: Should print to stderr!
return runtime.Str(str(e.msg))
@@ -598,21 +598,46 @@ def _EvalBracedVarSub(self, part, part_vals, quoted):
if val.tag == value_e.Str: # Slice UTF-8 characters in a string.
s = val.s
if begin < 0:
# How do we count characters from the end? I guess we have to
# decode the whole thing.
raise NotImplementedError
try:
if begin < 0:
# A negative start index could be supported by computing the length in
# Unicode characters, but that's confusing.
# TODO: Instead of attributing it to the word part, it would be
# better if we attributed it to arith_expr begin.
raise util.InvalidSlice(
"The start index of a string slice can't be negative: %d",
begin, part=part)
byte_begin = libstr.AdvanceUtf8Chars(s, begin, 0)
byte_begin = libstr.AdvanceUtf8Chars(s, begin, 0)
if length is None:
byte_end = len(s)
if length is None:
byte_end = len(s)
else:
if length < 0:
# TODO: Instead of attributing it to the word part, it would be
# better if we attributed it to arith_expr length.
raise util.InvalidSlice(
"The length of a string slice can't be negative: %d",
length, part=part)
byte_end = libstr.AdvanceUtf8Chars(s, length, byte_begin)
except (util.InvalidSlice, util.InvalidUtf8) as e:
if self.exec_opts.strict_word_eval:
raise
else:
# TODO:
# - We don't see the error location here, but we do see it when
# 'set -o strict-word-eval' is on.
# - This doesn't make the command exit with status 1. It just sets
# the word to the empty string.
util.warn(e.UserErrorString())
substr = '' # error condition
else:
if length < 0:
raise NotImplementedError
byte_end = libstr.AdvanceUtf8Chars(s, length, byte_begin)
substr = s[byte_begin : byte_end]
val = runtime.Str(s[byte_begin : byte_end])
val = runtime.Str(substr)
elif val.tag == value_e.StrArray: # Slice array entries.
# NOTE: unset elements don't count towards the length.
View
@@ -11,3 +11,7 @@ set -o debug-completion
#### debug-completion from command line
$SH -o debug-completion
## status: 0
# NOTE: strict-arith has one case in arith.test.sh, and strict-word-eval has a case in var-op-other.test.sh.
View
@@ -32,16 +32,16 @@ done
0
1
2
error: Incomplete utf-8
Incomplete UTF-8 character
3
4
error: Incomplete utf-8
error: Incomplete utf-8
Incomplete UTF-8 character
Incomplete UTF-8 character
5
6
error: Incomplete utf-8
error: Incomplete utf-8
error: Incomplete utf-8
Incomplete UTF-8 character
Incomplete UTF-8 character
Incomplete UTF-8 character
7
## END
# zsh behavior actually matches bash!
@@ -84,21 +84,21 @@ for num_bytes in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14; do
echo ${#s}
done
## STDOUT:
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid utf-8 continuation byte
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid utf-8 continuation byte
error: Invalid utf-8 continuation byte
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
error: Invalid utf-8 continuation byte
error: Invalid utf-8 continuation byte
error: Invalid utf-8 continuation byte
error: Invalid start of utf-8 char
error: Invalid start of utf-8 char
Invalid start of UTF-8 character
Invalid start of UTF-8 character
Invalid start of UTF-8 character
Invalid UTF-8 continuation byte
Invalid start of UTF-8 character
Invalid start of UTF-8 character
Invalid UTF-8 continuation byte
Invalid UTF-8 continuation byte
Invalid start of UTF-8 character
Invalid start of UTF-8 character
Invalid UTF-8 continuation byte
Invalid UTF-8 continuation byte
Invalid UTF-8 continuation byte
Invalid start of UTF-8 character
Invalid start of UTF-8 character
## END
## BUG bash/zsh STDOUT:
1
@@ -318,23 +318,39 @@ _defg
#### String slice: negative begin
foo=abcdefg
echo ${foo: -4:3}
## OK osh stdout:
## stdout: def
## N-I dash status: 2
## N-I dash stdout-json: ""
#### String slice: negative second arg is position, not length
foo=abcdefg
echo ${foo:3:-1} ${foo: 3: -2} ${foo:3 :-3 }
## OK osh stdout:
## stdout: def de d
## BUG mksh stdout: defg defg defg
## N-I dash status: 2
## N-I dash stdout-json: ""
#### strict-word-eval with string slice
set -o strict-word-eval || true
echo slice
s='abc'
echo -${s: -2}-
## stdout-json: "slice\n"
## status: 1
## N-I bash status: 0
## N-I bash stdout-json: "slice\n-bc-\n"
## N-I dash status: 2
## N-I dash stdout-json: ""
## N-I mksh/zsh status: 1
## N-I mksh/zsh stdout-json: ""
#### String slice with math
# I think this is the $(()) language inside?
i=1
foo=abcdefg
echo ${foo: i-3-2 : i + 2}
echo ${foo: i+4-2 : i + 2}
## stdout: def
## N-I dash status: 2
## N-I dash stdout-json: ""
@@ -367,17 +383,30 @@ echo ${foo:1:3}
## N-I dash status: 2
## N-I dash stdout-json: ""
#### Slice UTF-8 string with invalid data
# mksh slices by bytes.
#### Slice string with invalid UTF-8 results in empty string and warning
s=$(echo -e "\xFF")bcdef
echo ${s:1:3}
## status: 1
## stdout-json: ""
## stderr-json: "error: Invalid start of utf-8 char"
echo -${s:1:3}-
## status: 0
## stdout-json: "--\n"
## stderr-json: "osh warning: Invalid start of UTF-8 character\n"
## BUG bash/mksh/zsh status: 0
## BUG bash/mksh/zsh stdout-json: "bcd\n"
## BUG bash/mksh/zsh stdout-json: "-bcd-\n"
## BUG bash/mksh/zsh stderr-json: ""
## N-I dash status: 2
## N-I dash stdout-json: ""
## N-I dash stderr-json: "_tmp/spec-bin/dash: 2: Bad substitution\n"
#### Slice string with invalid UTF-8 with strict-word-eval
set -o strict-word-eval || true
echo slice
s=$(echo -e "\xFF")bcdef
echo -${s:1:3}-
## status: 1
## stdout-json: "slice\n"
## N-I mksh/zsh status: 1
## N-I mksh/zsh stdout-json: ""
## N-I dash status: 2
## N-I dash stdout-json: ""
## N-I bash status: 0
## N-I bash stdout-json: "slice\n-bcd-\n"
View
@@ -14,6 +14,7 @@
Results:
PASS - we got the ideal, expected value
OK - we got a value that was not ideal, but expected
For OSH this is behavior that was defined to be different?
N-I - Not implemented (e.g. $''). Assertions still checked (in case it
starts working)
BUG - we verified the value of a known bug

0 comments on commit 132cee4

Please sign in to comment.