[osh] Don't do code point checks for echo -e and printf

Because bash and zsh don't, and it can be a useful escape hatch if you want to do unusual calculations in YSH. We also don't have good error locations.
oils-for-unix · May 22, 2024 · 76fea02 · 76fea02
1 parent 6960ac9
commit 76fea02
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 38 deletions.
diff --git a/data_lang/j8.py b/data_lang/j8.py
@@ -122,12 +122,11 @@ def Utf8Encode(code):
         num_cont_bytes = 1
     elif code <= 0xFFFF:
         num_cont_bytes = 2
-    elif code <= 0x10FFFF:
-        num_cont_bytes = 3
-
     else:
-        # TODO: Assertion error
-        return '\xEF\xBF\xBD'  # unicode replacement character
+        # What about the check code <= 0x10FFFF ?
+        # - it happens in statically parsed $'' u''
+        # - but not dynamically parsed echo -e / printf, following bash/zsh
+        num_cont_bytes = 3
 
     bytes_ = []  # type: List[int]
     for _ in xrange(num_cont_bytes):

diff --git a/osh/word_compile.py b/osh/word_compile.py
@@ -104,30 +104,25 @@ def EvalCStringToken(id_, value):
         i = int(s, 16)
         return chr(i)
 
+    # Note: we're not doing the surrogate range and max code point checks for
+    # echo -e and printf:
+    #
+    # 1. It's not compatible with bash
+    # 2. We don't have good error locations anyway
+
     elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
         s = value[2:]
         code_point = int(s, 16)
-
-        # Keep going
+        return j8.Utf8Encode(code_point)
 
     elif id_ == Id.Char_UBraced:
         s = value[3:-1]  # \u{123}
         code_point = int(s, 16)
+        return j8.Utf8Encode(code_point)
 
     else:
         raise AssertionError(Id_str(id_))
 
-    # These checks are redundant for $'' u'' because we already checked at
-    # parse time.  But we need them for echo -e / printf.
-    if code_point > 0x10ffff:
-        e_die("Code point can't be greater than U+10ffff", loc.Missing)
-    if 0xD800 <= code_point and code_point < 0xE000:
-        e_die(
-            r"Code point is illegal because it's in the surrogate range",
-            loc.Missing)
-
-    return j8.Utf8Encode(code_point)
-
 
 def EvalSingleQuoted(id_, tokens):
     # type: (Id_t, List[Token]) -> str

diff --git a/spec/unicode.test.sh b/spec/unicode.test.sh
@@ -144,7 +144,7 @@ fail
 ## END
 
 
-#### printf / echo -e check that 0x110000 is too big at runtime
+#### printf / echo -e do NOT check max code point at runtime
 case $SH in mksh) exit ;; esac
 
 py-repr() {
@@ -160,13 +160,6 @@ echo status=$?
 py-repr "$p"
 
 ## STDOUT:
-status=1
-''
-status=1
-''
-## END
-
-## BUG bash/zsh STDOUT:
 status=0
 '\xf4\x90\x80\x80'
 status=0
@@ -176,7 +169,7 @@ status=0
 ## BUG mksh STDOUT:
 ## END
 
-#### printf / echo -e check surrogates at runtime
+#### printf / echo -e do NOT check surrogates at runtime
 case $SH in mksh) exit ;; esac
 
 py-repr() {
@@ -200,17 +193,6 @@ echo status=$?
 py-repr "$p"
 
 ## STDOUT:
-status=1
-''
-status=1
-''
-status=1
-''
-status=1
-''
-## END
-
-## BUG bash STDOUT:
 status=0
 '\xed\xb0\x80'
 status=0