[ysh breaking] @() decodes J8 lines, instead of doing IFS splitting

This is the YSH style. Invariant: any argv array can be represented. That is not true with IFS splitting: to represent an arbitrary string you would have to change IFS, a mutable global variable, and that leads to data-dependent bugs.
oilshell · Apr 19, 2024 · 20d2da4 · 20d2da4
1 parent 0144322
commit 20d2da4
Show file tree

Hide file tree

Showing 6 changed files with 161 additions and 89 deletions.
diff --git a/data_lang/j8.py b/data_lang/j8.py
@@ -583,7 +583,7 @@ def Next(self):
         self.pos = end_pos
         return tok_id, end_pos, None
 
-    def J8LinesNext(self):
+    def NextForLines(self):
         # type: () -> Tuple[Id_t, int, Optional[str]]
         """ Like Next(), but for J8 Lines """
 
@@ -734,14 +734,19 @@ def _Next(self):
     def _Eat(self, tok_id):
         # type: (Id_t) -> None
 
-        # TODO: Need location info
         if self.tok_id != tok_id:
             #log('position %r %d-%d %r', self.s, self.start_pos,
             #    self.end_pos, self.s[self.start_pos:self.end_pos])
             raise self._Error("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))
         self._Next()
 
+    def _NextForLines(self):
+        # type: () -> None
+        """Like _Next, but use the J8 Lines lexer."""
+        self.start_pos = self.end_pos
+        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()
+
     def _Error(self, msg):
         # type: (str) -> error.Decode
         return error.Decode(msg, self.s, self.start_pos, self.end_pos)
@@ -1063,100 +1068,135 @@ def ParseNil8(self):
 
 
 class J8LinesParser(_Parser):
-    """
-    Line-based grammar:
+    """Decode lines from a string with newlines.
+
+    We specify this with a grammar, to preserve location info and to reduce
+    allocations.  (But note that unquoted_line is more like a LOOP than it is
+    grammatical.)
+
+    Grammar:
+
+        end           = Op_Newline | Eol_Tok
+
+        empty_line    = WS_Space? end
 
-        j8_line  = Space? string Space? (Op_Newline | Eof_Real)
-        string = J8_String
-               | (Lit_Chars (Space (Lit_Chars | Left-*) )*
+        # special case: read until end token, but REMOVE trailing WS_Space
+        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end
+
+        j8_line       = WS_Space? J8_String WS_Space? end
+
+        lines         = (empty_line | unquoted_line | j8_line)*
 
     where Lit_Chars is valid UTF-8
 
-    Note that "" and u'' here are not a decoded string, because the line
-    started with literals:
+    Notes:
+
+    (1) We disallow multiple strings on a line, like:
+
+        "json" "json2"
+        "json" unquoted
+
+    (2) Internal quotes are allowed on unquoted lines.  Consider this line:
 
         foo "" u''
 
-    Using the grammar style preserves location info, reduces allocations.
+    The "" and u'' are not a decoded string, because the line started with
+    Id.Lit_Chars literals.
 
-    - TODO: We something like LexerDecoder, but with a J8_LINES mode
-    - _Parser should take the outer lexer.
+    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
+        Does it have - for empty cell?
     """
 
     def __init__(self, s):
         # type: (str) -> None
         _Parser.__init__(self, s, True)
 
-    def ParseLine(self):
-        # type: () -> Optional[str]
-        pass
+    def _Show(self, s):
+        # type: (str) -> None
+        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
+            self.end_pos)
 
-    def _Parse(self):
-        # type: () -> List[str]
+    def _ParseLine(self, out):
+        # type: (List[str]) -> None
+        """ May append a line to 'out' """
+        #self._Show('1')
+        if self.tok_id == Id.WS_Space:
+            self._NextForLines()
 
-        log('tok_id %s', Id_str(self.tok_id))
+        # Empty line - return without doing anything
+        if self.tok_id in (Id.Op_Newline, Id.Eol_Tok):
+            self._NextForLines()
+            return
 
-        if self.tok_id == Id.WS_Space:
-            self._Next()
+        # Quoted string on line
+        if self.tok_id == Id.J8_String:
+            out.append(self.decoded)
+            self._NextForLines()
 
-        if self.tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
-                      Id.Left_USingleQuote):
-            #tok_id, end_pos, s = self._DecodeString(tok_id, end_pos)
-            #log('s %r', s)
-            pass
+            if self.tok_id == Id.WS_Space:  # trailing whitespace
+                self._NextForLines()
 
-        if self.tok_id == Id.WS_Space:
-            self._Next()
-        pass
-        return None
+            if self.tok_id not in (Id.Op_Newline, Id.Eol_Tok):
+                raise self._Error('Unexpected text after J8 Line (%s)' %
+                                  Id_str(self.tok_id))
 
-    def _Show(self, s):
-        # type: (str) -> None
-        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
-                self.end_pos)
+            self._NextForLines()
+            return
+
+        # Unquoted line
+        if self.tok_id == Id.Lit_Chars:
+            # '  unquoted "" text on line  '   # read every token until end
+            string_start = self.start_pos
+            while True:
+                # for stripping whitespace
+                prev_id = self.tok_id
+                prev_start = self.start_pos
+
+                self._NextForLines()
+                if self.tok_id in (Id.Op_Newline, Id.Eol_Tok):
+                    break
+
+            if prev_id == Id.WS_Space:
+                string_end = prev_start  # remove trailing whitespace
+            else:
+                string_end = self.start_pos
+
+            out.append(self.s[string_start:string_end])
+
+            self._NextForLines()  # past newline
+            return
+
+        raise AssertionError(Id_str(self.tok_id))
 
     def Parse(self):
         # type: () -> List[str]
         """ Raises error.Decode. """
+        self._NextForLines()
 
-        self._Show('1')
-
-        # TODO: J8LinesNext()
-        self._Next()
-        self._Show('2')
-        lines = self._Parse()
+        lines = []  # type: List[str]
+        while self.tok_id != Id.Eol_Tok:
+            self._ParseLine(lines)
 
         if self.tok_id != Id.Eol_Tok:
-            raise self._Error('Unexpected trailing input')
+            raise self._Error('Unexpected trailing input in J8 Lines')
+
         return lines
 
 
 def SplitJ8Lines(s):
     # type: (str) -> List[str]
     """Used by @(echo split command sub)
 
-    3 Errors
+    Raises:
+      error.Decode
 
-    - quotes don't match on a line
-    - J8 syntax error inside quotes
+    3 Errors:
+    - J8 string syntax error inside quotes
+    - Extra input on line
     - unquoted line isn't utf-8
-
-    Note that blank lines are IGNORED
-
-    Notes:
-    - This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
-      - It might have the - alias for empty cell?
-
-    Can we write this like a lexer, to preserve positions?
-
-    We just need a significant newline character, and then maybe an "outer"
-    lexer mode with \n and whitespace?
-
-    Or we can also just reuse the same LexerDecoder for each line?  Reset()?
-    Then we get the line number for free.
     """
     p = J8LinesParser(s)
-    # raises error.Decode
     return p.Parse()
 
+
 # vim: sw=4
diff --git a/frontend/lexer_def.py b/frontend/lexer_def.py
@@ -602,7 +602,7 @@ def R(pat, tok_type):
     R(r'[\n]', Id.Op_Newline),
 
     # not space or ' or " or EOF
-    R(r'''[^ \r\t'"\0]+''', Id.Lit_Chars),
+    R(r'''[^ \t\r\n'"\0]+''', Id.Lit_Chars),
 ]
 
 # Exclude control characters 0x00-0x1f, aka 0-31 in J8 data

diff --git a/osh/word_eval.py b/osh/word_eval.py
@@ -54,7 +54,7 @@
 from core import state
 from core import ui
 from core import util
-#from data_lang import j8
+from data_lang import j8
 from data_lang import j8_lite
 from core.error import e_die
 from frontend import consts
@@ -2304,8 +2304,13 @@ def _EvalCommandSub(self, cs_part, quoted):
 
         if cs_part.left_token.id == Id.Left_AtParen:
             # YSH splitting algorithm: does not depend on IFS
-            #strs = j8.SplitJ8Lines(stdout_str)
-            strs = self.splitter.SplitForWordEval(stdout_str)
+            try:
+                strs = j8.SplitJ8Lines(stdout_str)
+            except error.Decode as e:
+                # status code 4 is special, for encode/decode errors.
+                raise error.Structured(4, e.Message(), cs_part.left_token)
+
+            #strs = self.splitter.SplitForWordEval(stdout_str)
             return part_value.Array(strs)
         else:
             return Piece(stdout_str, quoted, not quoted)

diff --git a/spec/ysh-command-sub.test.sh b/spec/ysh-command-sub.test.sh
@@ -1,4 +1,4 @@
-## oils_failures_allowed: 1
+## oils_failures_allowed: 0
 
 #### Conflict with extglob @( can be avoided with ,(
 
@@ -39,13 +39,13 @@ x=axbxxc
 argv.py $x
 
 # This construct behaves the same with simple_word_eval on
+# Not affected by IFS
 echo --
 shopt -s simple_word_eval
 write -- @(seq 3)
 
-
 echo --
-write -- @(echo axbxxc)
+argv.py @(echo $x)
 
 ## STDOUT:
 1
@@ -58,38 +58,45 @@ write -- @(echo axbxxc)
 2
 3
 --
-a
-b
-
-c
+['axbxxc']
 ## END
 
 #### @() decodes J8 Lines
 
 # syntax errors - TODO: document this in doc/chap-errors
 
-# - quotes that don't match
-# - syntax error inside quotes
+# - syntax error in J8 string (bad escape, no closing quote)
+# - extra text after line
 # - unquoted line isn't valid UTF-8
 
 var b = @(
-  # spaces stripped here
-  echo "  unquoted ";
+  # unquoted: spaces stripped at beginning and end
+  echo '  unquoted "" word  '
+
+  # empty line is ignored
+  echo
 
   # Not allowed, since unquoted lines should be UTF-8
   #echo $'binary \xff';
 
-  echo '"json\n\u03bc"';
-  echo "u'j8 u \\u{3bc}'";
-  echo "b'j8 b \\yff'";
+  echo '  " json \u03bc " '
+  echo " u' j8 u \\u{3bc} ' "
+  echo " b' j8 b \\u{3bc} ' "
 
-  # no quotes is same as u''
-  echo "'j8 u \\u{3bc}'";
+  # no prefix is like u''
+  echo "  ' j8 no prefix \\u{3bc} ' "
 )
 
 json write (b)
 
 ## STDOUT:
+[
+  "unquoted \"\" word",
+  " json μ ",
+  " j8 u μ ",
+  " j8 b μ ",
+  " j8 no prefix μ "
+]
 ## END
 
 #### for loop using @(seq $n)

diff --git a/test/ysh-runtime-errors.sh b/test/ysh-runtime-errors.sh
@@ -917,23 +917,38 @@ pp line (d)
 }
 
 test-command-sub-j8() {
-  return
+  _ysh-should-run-here <<'EOF'
+write @(echo ' "json\tstring"  '; echo; echo " b'j8' "; echo ' unquoted ';)
+EOF
+  #return
 
   # quotes that don't match
-  _ysh-error-here-X 2 <<'EOF'
+  _ysh-error-here-X 4 <<'EOF'
 var lines = @(
-  echo "\"unbalanced"
+  echo '"unbalanced'
 )
 pp line (lines)
+EOF
+
+  # error in word language
+  _ysh-error-here-X 4 <<'EOF'
+write @(echo '"unbalanced')
+EOF
+
+  # can't have two strings on a line
+  _ysh-error-here-X 4 <<'EOF'
+write @(echo '"json" "nope"')
+EOF
+
+  _ysh-error-here-X 4 <<'EOF'
+write @(echo '"json" unquoted')
 EOF
 
   # syntax error inside quotes
-  _ysh-error-here-X 2 <<'EOF'
-var lines = @(
-  echo '"\"'
-)
-pp line (lines)
+  _ysh-error-here-X 4 <<'EOF'
+write @(echo '"hello \z"')
 EOF
+  return
 
   # unquoted line isn't valid UTF-8
   _ysh-error-here-X 2 <<'EOF'