Skip to content

Commit

Permalink
[osh] Don't do code point checks for echo -e and printf
Browse files Browse the repository at this point in the history
Because bash and zsh don't, and it can be a useful escape hatch if you
want to do unusual calculations in YSH.

We also don't have good error locations.
  • Loading branch information
Andy Chu committed May 22, 2024
1 parent 6960ac9 commit 76fea02
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 38 deletions.
9 changes: 4 additions & 5 deletions data_lang/j8.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,11 @@ def Utf8Encode(code):
num_cont_bytes = 1
elif code <= 0xFFFF:
num_cont_bytes = 2
elif code <= 0x10FFFF:
num_cont_bytes = 3

else:
# TODO: Assertion error
return '\xEF\xBF\xBD' # unicode replacement character
# What about the check code <= 0x10FFFF ?
# - it happens in statically parsed $'' u''
# - but not dynamically parsed echo -e / printf, following bash/zsh
num_cont_bytes = 3

bytes_ = [] # type: List[int]
for _ in xrange(num_cont_bytes):
Expand Down
21 changes: 8 additions & 13 deletions osh/word_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,30 +104,25 @@ def EvalCStringToken(id_, value):
i = int(s, 16)
return chr(i)

# Note: we're not doing the surrogate range and max code point checks for
# echo -e and printf:
#
# 1. It's not compatible with bash
# 2. We don't have good error locations anyway

elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
s = value[2:]
code_point = int(s, 16)

# Keep going
return j8.Utf8Encode(code_point)

elif id_ == Id.Char_UBraced:
s = value[3:-1] # \u{123}
code_point = int(s, 16)
return j8.Utf8Encode(code_point)

else:
raise AssertionError(Id_str(id_))

# These checks are redundant for $'' u'' because we already checked at
# parse time. But we need them for echo -e / printf.
if code_point > 0x10ffff:
e_die("Code point can't be greater than U+10ffff", loc.Missing)
if 0xD800 <= code_point and code_point < 0xE000:
e_die(
r"Code point is illegal because it's in the surrogate range",
loc.Missing)

return j8.Utf8Encode(code_point)


def EvalSingleQuoted(id_, tokens):
# type: (Id_t, List[Token]) -> str
Expand Down
22 changes: 2 additions & 20 deletions spec/unicode.test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ fail
## END


#### printf / echo -e check that 0x110000 is too big at runtime
#### printf / echo -e do NOT check max code point at runtime
case $SH in mksh) exit ;; esac

py-repr() {
Expand All @@ -160,13 +160,6 @@ echo status=$?
py-repr "$p"

## STDOUT:
status=1
''
status=1
''
## END

## BUG bash/zsh STDOUT:
status=0
'\xf4\x90\x80\x80'
status=0
Expand All @@ -176,7 +169,7 @@ status=0
## BUG mksh STDOUT:
## END

#### printf / echo -e check surrogates at runtime
#### printf / echo -e do NOT check surrogates at runtime
case $SH in mksh) exit ;; esac

py-repr() {
Expand All @@ -200,17 +193,6 @@ echo status=$?
py-repr "$p"

## STDOUT:
status=1
''
status=1
''
status=1
''
status=1
''
## END

## BUG bash STDOUT:
status=0
'\xed\xb0\x80'
status=0
Expand Down

0 comments on commit 76fea02

Please sign in to comment.