Skip to content

Commit

Permalink
[unicode] Check for code points above 0x10ffff at parse time
Browse files Browse the repository at this point in the history
Still need more checks for J8, dynamic parsing, etc.

Also need the surrogate range check.
  • Loading branch information
Andy Chu committed May 22, 2024
1 parent a0b80c7 commit 4aab5cb
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 17 deletions.
1 change: 1 addition & 0 deletions data_lang/j8.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def Utf8Encode(code):
num_cont_bytes = 3

else:
# TODO: raise an assertion error here instead of returning U+FFFD
return '\xEF\xBF\xBD' # unicode replacement character

bytes_ = [] # type: List[int]
Expand Down
33 changes: 28 additions & 5 deletions osh/word_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
word_part_e,
word_part_t,
)
from core.error import p_die
from data_lang import j8
from frontend import consts
from frontend import lexer
Expand Down Expand Up @@ -62,9 +63,10 @@ def EvalCharLiteralForRegex(tok):

def EvalCStringToken(id_, value):
# type: (Id_t, str) -> Optional[str]
"""This function is shared between echo -e and $''.
$'' could use it at compile time, much like brace expansion in braces.py.
"""All types of C-style backslash-escaped strings use this function:
- echo -e and printf at runtime
- $'' and b'' u'' at parse time
"""
if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
# shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.
Expand Down Expand Up @@ -114,7 +116,7 @@ def EvalCStringToken(id_, value):
raise AssertionError(Id_str(id_))


def EvalSingleQuoted2(id_, tokens):
def EvalSingleQuoted(id_, tokens):
# type: (Id_t, List[Token]) -> str
""" Done at parse time """
if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
Expand All @@ -128,7 +130,25 @@ def EvalSingleQuoted2(id_, tokens):
for t in tokens:
print('T %s' % t)

strs = [EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens]
strs = []
for t in tokens:
# More parse time validation for code points.
# EvalCStringToken() redoes some of this work, but right now it's
# shared with dynamic echo -e / printf, which don't have tokens.

if t.id == Id.Char_Unicode8: # check for invalid \U00110000
s = lexer.TokenSliceLeft(t, 2)
i = int(s, 16)
if i > 0x10ffff:
p_die("Code point can't be greater than U+10ffff", t)

elif t.id == Id.Char_UBraced: # check for invalid \u{110000}
s = lexer.TokenSlice(t, 3, -1)
i = int(s, 16)
if i > 0x10ffff:
p_die("Code point can't be greater than U+10ffff", t)

strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))

else:
raise AssertionError(id_)
Expand Down Expand Up @@ -278,3 +298,6 @@ def RemoveLeadingSpaceSQ(tokens):
tok.id = Id.Lit_CharsWithoutPrefix

#log('STRIP tok %s', tok)


# vim: sw=4
2 changes: 1 addition & 1 deletion osh/word_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ def _ReadSingleQuoted(self, left_token, lex_mode):
# In command mode, we never disallow backslashes like '\'
right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
False)
sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
node = SingleQuoted(left_token, sval, right_quote)
return node

Expand Down
29 changes: 22 additions & 7 deletions spec/unicode.test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ printf '\u03bc \U000003bc\n'
## N-I dash/ash STDOUT:
## END

#### U+10ffff is max code point
#### Max code point U+10ffff can be escaped with $'' printf echo -e

case $SH in dash|ash) exit ;; esac

Expand Down Expand Up @@ -86,7 +86,7 @@ py-repr $(printf '\U0010ffff')
'\xf4\x8f\xbf\xbf'
## END

#### 0x00110000 is greater than max code point
#### $'' checks that 0x110000 is too big at parse time

case $SH in dash|ash|mksh) exit ;; esac

Expand All @@ -95,9 +95,6 @@ py-repr() {
}

py-repr $'\U00110000'
py-repr $(echo -e '\U00110000')
py-repr $(printf '\U00110000')


## status: 2
## STDOUT:
Expand All @@ -108,9 +105,27 @@ py-repr $(printf '\U00110000')
## BUG bash/zsh status: 0
## BUG bash/zsh STDOUT:
'\xf4\x90\x80\x80'
'\xf4\x90\x80\x80'
'\xf4\x90\x80\x80'
## END


#### printf / echo -e check that 0x110000 is too big at runtime
case $SH in mksh) exit ;; esac

py-repr() {
python2 -c 'import sys; print repr(sys.argv[1])' "$@"
}

py-repr $(echo -e '\U00110000')
py-repr $(printf '\U00110000')

## STDOUT:
echo
## END

## BUG bash/zsh STDOUT:
'\xf4\x90\x80\x80'
'\xf4\x90\x80\x80'
## END

## BUG mksh STDOUT:
## END
9 changes: 6 additions & 3 deletions spec/ysh-unicode.test.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## oils_failures_allowed: 3
## oils_failures_allowed: 2

#### ${#s} and len(s)

Expand Down Expand Up @@ -81,17 +81,20 @@ echo status too_big=$?
# python2 -c 'import sys; c = sys.argv[1].decode("utf-8"); print len(c)' "$too_big"

var max = u'\u{10ffff}'
var too_big = u'\u{110000}'
pp line (max)

echo 'should not get here'
var too_big = u'\u{110000}'
pp line (too_big) # should not get here

# These are errors too
var max = b'\u{10ffff}'
var too_big = b'\u{110000}'

## status: 2
## STDOUT:
status max=0
status too_big=1
(Str) "􏿿"
## END


Expand Down
2 changes: 1 addition & 1 deletion ysh/expr_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ def _PushYshTokens(parse_ctx, gr, p, lex):
last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
True)

sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
sq_part = SingleQuoted(left_token, sval, last_token)

typ = Id.Expr_CastedDummy
Expand Down

0 comments on commit 4aab5cb

Please sign in to comment.