[unicode] Check for code points above 0x10ffff at parse time

Still need more checks for J8, dynamic parsing, etc. Also need the surrogate range check.
oilshell · May 22, 2024 · 4aab5cb · 4aab5cb
1 parent a0b80c7
commit 4aab5cb
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 17 deletions.
diff --git a/data_lang/j8.py b/data_lang/j8.py
@@ -126,6 +126,7 @@ def Utf8Encode(code):
         num_cont_bytes = 3
 
     else:
+        # TODO: Assertion error
         return '\xEF\xBF\xBD'  # unicode replacement character
 
     bytes_ = []  # type: List[int]

diff --git a/osh/word_compile.py b/osh/word_compile.py
@@ -13,6 +13,7 @@
     word_part_e,
     word_part_t,
 )
+from core.error import p_die
 from data_lang import j8
 from frontend import consts
 from frontend import lexer
@@ -62,9 +63,10 @@ def EvalCharLiteralForRegex(tok):
 
 def EvalCStringToken(id_, value):
     # type: (Id_t, str) -> Optional[str]
-    """This function is shared between echo -e and $''.
-
-    $'' could use it at compile time, much like brace expansion in braces.py.
+    """All types of C-style backslash-escaped strings use this function:
+    
+    - echo -e and printf at runtime
+    - $'' and b'' u'' at parse time
     """
     if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
         # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.
@@ -114,7 +116,7 @@ def EvalCStringToken(id_, value):
         raise AssertionError(Id_str(id_))
 
 
-def EvalSingleQuoted2(id_, tokens):
+def EvalSingleQuoted(id_, tokens):
     # type: (Id_t, List[Token]) -> str
     """ Done at parse time """
     if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
@@ -128,7 +130,25 @@ def EvalSingleQuoted2(id_, tokens):
             for t in tokens:
                 print('T %s' % t)
 
-        strs = [EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens]
+        strs = []
+        for t in tokens:
+            # More parse time validation for code points.
+            # EvalCStringToken() redoes some of this work, but right now it's
+            # shared with dynamic echo -e / printf, which don't have tokens.
+
+            if t.id == Id.Char_Unicode8:  # check for invalid \U00110000
+                s = lexer.TokenSliceLeft(t, 2)
+                i = int(s, 16)
+                if i > 0x10ffff:
+                    p_die("Code point can't be greater than U+10ffff", t)
+
+            elif t.id == Id.Char_UBraced:  # check for invalid \u{110000}
+                s = lexer.TokenSlice(t, 3, -1)
+                i = int(s, 16)
+                if i > 0x10ffff:
+                    p_die("Code point can't be greater than U+10ffff", t)
+
+            strs.append(EvalCStringToken(t.id, lexer.TokenVal(t)))
 
     else:
         raise AssertionError(id_)
@@ -278,3 +298,6 @@ def RemoveLeadingSpaceSQ(tokens):
             tok.id = Id.Lit_CharsWithoutPrefix
 
             #log('STRIP tok %s', tok)
+
+
+# vim: sw=4
diff --git a/osh/word_parse.py b/osh/word_parse.py
@@ -636,7 +636,7 @@ def _ReadSingleQuoted(self, left_token, lex_mode):
         # In command mode, we never disallow backslashes like '\'
         right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                             False)
-        sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
+        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
         node = SingleQuoted(left_token, sval, right_quote)
         return node
 

diff --git a/spec/unicode.test.sh b/spec/unicode.test.sh
@@ -57,7 +57,7 @@ printf '\u03bc \U000003bc\n'
 ## N-I dash/ash STDOUT:
 ## END
 
-#### U+10ffff is max code point
+#### Max code point U+10ffff can be escaped with $''  printf  echo -e
 
 case $SH in dash|ash) exit ;; esac
 
@@ -86,7 +86,7 @@ py-repr $(printf '\U0010ffff')
 '\xf4\x8f\xbf\xbf'
 ## END
 
-#### 0x00110000 is greater than max code point
+#### $'' checks that 0x110000 is too big at parse time
 
 case $SH in dash|ash|mksh) exit ;; esac
 
@@ -95,9 +95,6 @@ py-repr() {
 }
 
 py-repr $'\U00110000'
-py-repr $(echo -e '\U00110000')
-py-repr $(printf '\U00110000')
-
 
 ## status: 2
 ## STDOUT:
@@ -108,9 +105,27 @@ py-repr $(printf '\U00110000')
 ## BUG bash/zsh status: 0
 ## BUG bash/zsh STDOUT:
 '\xf4\x90\x80\x80'
-'\xf4\x90\x80\x80'
-'\xf4\x90\x80\x80'
 ## END
 
 
+#### printf / echo -e check that 0x110000 is too big at runtime
+case $SH in mksh) exit ;; esac
+
+py-repr() {
+  python2 -c 'import sys; print repr(sys.argv[1])'  "$@"
+}
+
+py-repr $(echo -e '\U00110000')
+py-repr $(printf '\U00110000')
+
+## STDOUT:
+echo
+## END
 
+## BUG bash/zsh STDOUT:
+'\xf4\x90\x80\x80'
+'\xf4\x90\x80\x80'
+## END
+
+## BUG mksh STDOUT:
+## END
diff --git a/spec/ysh-unicode.test.sh b/spec/ysh-unicode.test.sh
@@ -1,4 +1,4 @@
-## oils_failures_allowed: 3
+## oils_failures_allowed: 2
 
 #### ${#s} and len(s)
 
@@ -81,17 +81,20 @@ echo status too_big=$?
 # python2 -c 'import sys; c = sys.argv[1].decode("utf-8"); print len(c)' "$too_big"
 
 var max = u'\u{10ffff}'
-var too_big = u'\u{110000}'
+pp line (max)
 
-echo 'should not get here'
+var too_big = u'\u{110000}'
+pp line (too_big)  # should not get here
 
 # These are errors too
 var max = b'\u{10ffff}'
 var too_big = b'\u{110000}'
 
+## status: 2
 ## STDOUT:
 status max=0
 status too_big=1
+(Str)   "􏿿"
 ## END
 
 

diff --git a/ysh/expr_parse.py b/ysh/expr_parse.py
@@ -311,7 +311,7 @@ def _PushYshTokens(parse_ctx, gr, p, lex):
             last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                    True)
 
-            sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
+            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
             sq_part = SingleQuoted(left_token, sval, last_token)
 
             typ = Id.Expr_CastedDummy