From 4aab5cb3cd1776b6ac431eaabe7a0180c3227bc6 Mon Sep 17 00:00:00 2001 From: Andy Chu Date: Tue, 21 May 2024 22:53:12 -0400 Subject: [PATCH] [unicode] Check for code points above 0x10ffff at parse time Still need more checks for J8, dynamic parsing, etc. Also need the surrogate range check. --- data_lang/j8.py | 1 + osh/word_compile.py | 33 ++++++++++++++++++++++++++++----- osh/word_parse.py | 2 +- spec/unicode.test.sh | 29 ++++++++++++++++++++++------- spec/ysh-unicode.test.sh | 9 ++++++--- ysh/expr_parse.py | 2 +- 6 files changed, 59 insertions(+), 17 deletions(-) diff --git a/data_lang/j8.py b/data_lang/j8.py index 4e62bf17f..3a315f046 100644 --- a/data_lang/j8.py +++ b/data_lang/j8.py @@ -126,6 +126,7 @@ def Utf8Encode(code): num_cont_bytes = 3 else: + # TODO: Assertion error return '\xEF\xBF\xBD' # unicode replacement character bytes_ = [] # type: List[int] diff --git a/osh/word_compile.py b/osh/word_compile.py index f550b7486..0ce8a547a 100644 --- a/osh/word_compile.py +++ b/osh/word_compile.py @@ -13,6 +13,7 @@ word_part_e, word_part_t, ) +from core.error import p_die from data_lang import j8 from frontend import consts from frontend import lexer @@ -62,9 +63,10 @@ def EvalCharLiteralForRegex(tok): def EvalCStringToken(id_, value): # type: (Id_t, str) -> Optional[str] - """This function is shared between echo -e and $''. - - $'' could use it at compile time, much like brace expansion in braces.py. + """All types of C-style backslash-escaped strings use this function: + + - echo -e and printf at runtime + - $'' and b'' u'' at parse time """ if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash): # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH. 
@@ -114,7 +116,7 @@ def EvalCStringToken(id_, value): raise AssertionError(Id_str(id_)) -def EvalSingleQuoted2(id_, tokens): +def EvalSingleQuoted(id_, tokens): # type: (Id_t, List[Token]) -> str """ Done at parse time """ if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote, @@ -128,7 +130,25 @@ def EvalSingleQuoted2(id_, tokens): for t in tokens: print('T %s' % t) - strs = [EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens] + strs = [] + for t in tokens: + # More parse time validation for code points. + # EvalCStringToken() redoes some of this work, but right now it's + # shared with dynamic echo -e / printf, which don't have tokens. + + if t.id == Id.Char_Unicode8: # check for invalid \U00110000 + s = lexer.TokenSliceLeft(t, 2) + i = int(s, 16) + if i > 0x10ffff: + p_die("Code point can't be greater than U+10ffff", t) + + elif t.id == Id.Char_UBraced: # check for invalid \u{110000} + s = lexer.TokenSlice(t, 3, -1) + i = int(s, 16) + if i > 0x10ffff: + p_die("Code point can't be greater than U+10ffff", t) + + strs.append(EvalCStringToken(t.id, lexer.TokenVal(t))) else: raise AssertionError(id_) @@ -278,3 +298,6 @@ def RemoveLeadingSpaceSQ(tokens): tok.id = Id.Lit_CharsWithoutPrefix #log('STRIP tok %s', tok) + + +# vim: sw=4 diff --git a/osh/word_parse.py b/osh/word_parse.py index a718f7acb..051e3c4bd 100644 --- a/osh/word_parse.py +++ b/osh/word_parse.py @@ -636,7 +636,7 @@ def _ReadSingleQuoted(self, left_token, lex_mode): # In command mode, we never disallow backslashes like '\' right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens, False) - sval = word_compile.EvalSingleQuoted2(left_token.id, tokens) + sval = word_compile.EvalSingleQuoted(left_token.id, tokens) node = SingleQuoted(left_token, sval, right_quote) return node diff --git a/spec/unicode.test.sh b/spec/unicode.test.sh index 60c6541d2..3fb9d650b 100644 --- a/spec/unicode.test.sh +++ b/spec/unicode.test.sh @@ -57,7 +57,7 @@ printf '\u03bc \U000003bc\n' ## N-I 
dash/ash STDOUT: ## END -#### U+10ffff is max code point +#### Max code point U+10ffff can be escaped with $'' printf echo -e case $SH in dash|ash) exit ;; esac @@ -86,7 +86,7 @@ py-repr $(printf '\U0010ffff') '\xf4\x8f\xbf\xbf' ## END -#### 0x00110000 is greater than max code point +#### $'' checks that 0x110000 is too big at parse time case $SH in dash|ash|mksh) exit ;; esac @@ -95,9 +95,6 @@ py-repr() { } py-repr $'\U00110000' -py-repr $(echo -e '\U00110000') -py-repr $(printf '\U00110000') - ## status: 2 ## STDOUT: @@ -108,9 +105,27 @@ py-repr $(printf '\U00110000') ## BUG bash/zsh status: 0 ## BUG bash/zsh STDOUT: '\xf4\x90\x80\x80' -'\xf4\x90\x80\x80' -'\xf4\x90\x80\x80' ## END +#### printf / echo -e check that 0x110000 is too big at runtime +case $SH in mksh) exit ;; esac + +py-repr() { + python2 -c 'import sys; print repr(sys.argv[1])' "$@" +} + +py-repr $(echo -e '\U00110000') +py-repr $(printf '\U00110000') + +## STDOUT: +echo +## END +## BUG bash/zsh STDOUT: +'\xf4\x90\x80\x80' +'\xf4\x90\x80\x80' +## END + +## BUG mksh STDOUT: +## END diff --git a/spec/ysh-unicode.test.sh b/spec/ysh-unicode.test.sh index 841985fe2..14183f698 100644 --- a/spec/ysh-unicode.test.sh +++ b/spec/ysh-unicode.test.sh @@ -1,4 +1,4 @@ -## oils_failures_allowed: 3 +## oils_failures_allowed: 2 #### ${#s} and len(s) @@ -81,17 +81,20 @@ echo status too_big=$? 
# python2 -c 'import sys; c = sys.argv[1].decode("utf-8"); print len(c)' "$too_big" var max = u'\u{10ffff}' -var too_big = u'\u{110000}' +pp line (max) -echo 'should not get here' +var too_big = u'\u{110000}' +pp line (too_big) # should not get here # These are errors too var max = b'\u{10ffff}' var too_big = b'\u{110000}' +## status: 2 ## STDOUT: status max=0 status too_big=1 +(Str) "􏿿" ## END diff --git a/ysh/expr_parse.py b/ysh/expr_parse.py index ba01e4b40..1b6d232fe 100644 --- a/ysh/expr_parse.py +++ b/ysh/expr_parse.py @@ -311,7 +311,7 @@ def _PushYshTokens(parse_ctx, gr, p, lex): last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens, True) - sval = word_compile.EvalSingleQuoted2(left_token.id, tokens) + sval = word_compile.EvalSingleQuoted(left_token.id, tokens) sq_part = SingleQuoted(left_token, sval, last_token) typ = Id.Expr_CastedDummy