[ysh] Preparing for @() to decode J8 lines

Add some tests - not hooked up yet. Fix J8 lexer error message.
oilshell · Apr 17, 2024 · 04d9509 · 04d9509
1 parent 9152b7e
commit 04d9509
Show file tree

Hide file tree

Showing 8 changed files with 127 additions and 20 deletions.
diff --git a/data_lang/j8.py b/data_lang/j8.py
@@ -543,11 +543,11 @@ class LexerDecoder(object):
     string
     """
 
-    def __init__(self, s, is_j8):
-        # type: (str, bool) -> None
+    def __init__(self, s, is_j8, lang_str):
+        # type: (str, bool, str) -> None
         self.s = s
         self.is_j8 = is_j8
-        self.lang_str = "NIL8"
+        self.lang_str = lang_str
 
         self.pos = 0
         # Reuse this instance to save GC objects.  JSON objects could have
@@ -698,7 +698,7 @@ def __init__(self, s, is_j8):
         self.is_j8 = is_j8
         self.lang_str = "J8" if is_j8 else "JSON"
 
-        self.lexer = LexerDecoder(s, is_j8)
+        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
         self.tok_id = Id.Undefined_Tok
         self.start_pos = 0
         self.end_pos = 0
@@ -1050,4 +1050,62 @@ def ParseNil8(self):
         return obj
 
 
+# Quote style seen at either end of a line: none, '...' (incl. u'' b''), or "..."
+UNQUOTED = 0
+SINGLE = 1
+DOUBLE = 2
+
+def SplitJ8Lines(s):
+    # type: (str) -> List[str]
+    """Split command sub output into J8 Lines; for @(echo split command sub)
+
+    3 kinds of errors are planned; only the first is raised so far:
+
+    - quotes don't match on a line (raises error.Decode)
+    - J8 syntax error inside quotes (TODO: quoted lines aren't decoded yet)
+    - unquoted line isn't utf-8 (TODO: not validated yet)
+
+    Lines are stripped of surrounding whitespace; blank lines are IGNORED
+
+    Notes:
+    - This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
+      - It might have the - alias for empty cell?
+    """
+    lines = s.split('\n')
+    strs = []  # type: List[str]
+    for line in lines:
+        # strip leading and trailing whitespace, whether the line is quoted or not
+        line = line.strip()
+
+        if len(line) == 0:
+            # always skip blank lines - empty strings should be "" or ''
+            continue
+
+        left_type = UNQUOTED
+        if (line.startswith("u'") or line.startswith("b'") or line.startswith("'")):
+            left_type = SINGLE
+        elif line.startswith('"'):
+            left_type = DOUBLE
+
+        right_type = UNQUOTED
+        if line.endswith("'"):
+            right_type = SINGLE
+        elif line.endswith('"'):
+            right_type = DOUBLE
+
+        if left_type != right_type:
+            # TODO: position.  NOTE(review): a lone ' or " line passes this check
+            raise error.Decode("Mismatched quotes in J8 Lines", line, 0, 0)
+
+        if left_type == UNQUOTED:
+            # TODO: validate UTF-8
+            out_str = line
+        else:
+            # TODO: decode J8 - for now the quoted line is passed through as is
+            out_str = line
+        strs.append(out_str)
+
+    return strs
+
+
 # vim: sw=4
diff --git a/data_lang/j8_test.py b/data_lang/j8_test.py
@@ -149,37 +149,37 @@ def testJ8(self):
         print(obj)
 
     def testLexerDecoder(self):
-        lex = j8.LexerDecoder(r'{"hi": "bye \n"}', True)
+        lex = j8.LexerDecoder(r'{"hi": "bye \n"}', True, 'J8')
         _PrintTokens(lex)
 
-        lex = j8.LexerDecoder(r"{u'unicode': b'bytes \y1f \yff'}", True)
+        lex = j8.LexerDecoder(r"{u'unicode': b'bytes \y1f \yff'}", True, 'J8')
         _PrintTokens(lex)
 
         lex = j8.LexerDecoder(
-            r'{"mu \u03BC \u0001":' + r"b'mu \u{03bc} \u{2620}'", True)
+            r'{"mu \u03BC \u0001":' + r"b'mu \u{03bc} \u{2620}'", True, 'J8')
         _PrintTokens(lex)
 
-        lex = j8.LexerDecoder(r'{"x": [1, 2, 3.14, true]}', True)
+        lex = j8.LexerDecoder(r'{"x": [1, 2, 3.14, true]}', True, 'J8')
         _PrintTokens(lex)
 
         lex = j8.LexerDecoder(
             r'''
         [
           1e9, 1e-9, -1e9, -1E-9, 42
         ]
-        ''', True)
+        ''', True, 'J8')
         _PrintTokens(lex)
 
         try:
-            lex = j8.LexerDecoder('"\x01"', True)
+            lex = j8.LexerDecoder('"\x01"', True, 'J8')
             _PrintTokens(lex)
         except error.Decode as e:
             print(e)
         else:
             self.fail('Expected failure')
 
         try:
-            lex = j8.LexerDecoder('"\x1f"', True)
+            lex = j8.LexerDecoder('"\x1f"', True, 'J8')
             _PrintTokens(lex)
         except error.Decode as e:
             print(e)
@@ -197,11 +197,11 @@ def testMoreTokens(self):
             '(Node left:(-> 123))',
         ]
         for s in cases:
-            lex = j8.LexerDecoder(s, True)
+            lex = j8.LexerDecoder(s, True, 'J8')
             _PrintTokens(lex)
 
     def testErrorMessagePosition(self):
-        lex = j8.LexerDecoder("[ u'hi']", False)
+        lex = j8.LexerDecoder("[ u'hi']", False, 'J8')
         try:
             _PrintTokens(lex)
         except error.Decode as e:

diff --git a/doc/io-builtins.md b/doc/io-builtins.md
@@ -45,7 +45,7 @@ These are discussed in more detail in the [strings](strings.html) doc.
 
 Example:
 
-    hostname | read --all :x
+    hostname | read --all (&x)
     write -- $x
 
 ## Summary of YSH features

diff --git a/osh/split.py b/osh/split.py
@@ -179,7 +179,7 @@ def SplitForWordEval(self, s, ifs=None):
         # type: (str, Optional[str]) -> List[str]
         """Split used by word evaluation.
 
-        Also used by the explicit @split() function.
+        Also used by the explicit shSplit() function.
         """
         sp = self._GetSplitter(ifs=ifs)
         spans = sp.Split(s, True)

diff --git a/osh/word_eval.py b/osh/word_eval.py
@@ -54,6 +54,7 @@
 from core import state
 from core import ui
 from core import util
+from data_lang import j8
 from data_lang import j8_lite
 from core.error import e_die
 from frontend import consts
@@ -2300,7 +2301,10 @@ def CheckCircularDeps(self):
     def _EvalCommandSub(self, cs_part, quoted):
         # type: (CommandSub, bool) -> part_value_t
         stdout_str = self.shell_ex.RunCommandSub(cs_part)
+
         if cs_part.left_token.id == Id.Left_AtParen:
+            # YSH splitting algorithm: does not depend on IFS
+            #strs = j8.SplitJ8Lines(stdout_str)
             strs = self.splitter.SplitForWordEval(stdout_str)
             return part_value.Array(strs)
         else:

diff --git a/spec/ysh-command-sub.test.sh b/spec/ysh-command-sub.test.sh
@@ -66,16 +66,28 @@ c
 
 #### @() decodes J8 Lines
 
+# syntax errors - TODO: document this in doc/chap-errors
+
+# - quotes that don't match
+# - syntax error inside quotes
+# - unquoted line isn't valid UTF-8
+
 var b = @(
+  # spaces stripped here
   echo "  unquoted ";
-  # I guess this is allowed
-  echo $'binary \xff';
+
+  # Not allowed, since unquoted lines should be UTF-8
+  #echo $'binary \xff';
+
   echo '"json\n\u03bc"';
   echo "u'j8 u \\u{3bc}'";
-  echo "b'j8 b \\y{ff'";
+  echo "b'j8 b \\yff'";
+
+  # no prefix is same as u''
+  echo "'j8 u \\u{3bc}'";
 )
 
-pp line (b)
+json write (b)
 
 ## STDOUT:
 ## END

diff --git a/test/ysh-runtime-errors.sh b/test/ysh-runtime-errors.sh
@@ -916,6 +916,35 @@ pp line (d)
    '
 }
 
+test-command-sub-j8() {
+  return  # disabled for now: @() is not hooked up to J8 Lines decoding yet
+
+  # quotes that don't match
+  _ysh-error-here-X 2 <<'EOF'
+var lines = @(
+  echo "\"unbalanced"
+)
+pp line (lines)
+EOF
+
+  # syntax error inside quotes
+  _ysh-error-here-X 2 <<'EOF'
+var lines = @(
+  echo '"\"'
+)
+pp line (lines)
+EOF
+
+  # unquoted line isn't valid UTF-8
+  _ysh-error-here-X 2 <<'EOF'
+var lines = @(
+  echo $'\xff'
+)
+pp line (lines)
+EOF
+
+}
+
 soil-run-py() {
   run-test-funcs
 }

diff --git a/ysh/expr_eval.py b/ysh/expr_eval.py
@@ -53,6 +53,7 @@
 from core import state
 from core import ui
 from core import vm
+from data_lang import j8
 from frontend import lexer
 from frontend import match
 from frontend import typed_args
@@ -1027,8 +1028,11 @@ def _EvalExpr(self, node):
                 else:
                     stdout_str = self.shell_ex.RunCommandSub(node)
                     if id_ == Id.Left_AtParen:  # @(seq 3)
-                        # TODO: Should use J8 lines
+                        # YSH splitting algorithm: does not depend on IFS
+                        #strs = j8.SplitJ8Lines(stdout_str)
+
                         strs = self.splitter.SplitForWordEval(stdout_str)
+
                         items = [value.Str(s)
                                  for s in strs]  # type: List[value_t]
                         return value.List(items)