Working on translating here docs to Oil.

- Test an invariant of the arena with here docs.
- Make the here-dq.sh test pass by adding a line_span() instance for the ending delimiter.
- here-sq.sh still doesn't pass.
- Statically evaluate the here doc delimiter at the end of the line, rather than when you encounter it. This makes it so there are fewer things in the HereDoc node.
- Remove HereDoc.was_filled. Add here_end_span_id.
- here-doc spec test: Clarify that multiple leading tabs are stripped.
- Move some parse error bugs from cmd_parse_test.py to test/parse-errors.sh. This revealed a bug in osh -c 'cat <<EOF', which is now fixed.
oilshell · Aug 28, 2018 · 4cdba05 · 4cdba05
1 parent 92051ae
commit 4cdba05
Show file tree

Hide file tree

Showing 12 changed files with 178 additions and 57 deletions.
diff --git a/bin/oil.py b/bin/oil.py
@@ -184,8 +184,9 @@ def OshMain(argv0, argv, login_shell):
                 ['text', 'abbrev-text', 'html', 'abbrev-html', 'oheap', 'none'],
                 default='abbrev-text')
   spec.LongFlag('--show-ast')  # execute and show
-  spec.LongFlag('--fix')
+  spec.LongFlag('--fix')  # oshc translate
   spec.LongFlag('--debug-spans')  # For oshc translate
+  spec.LongFlag('--parse-and-print-arena')  # Invariant for translation
   spec.LongFlag('--print-status')
   spec.LongFlag('--trace', ['cmd-parse', 'word-parse', 'lexer'])  # NOTE: can only trace one now
   spec.LongFlag('--hijack-shebang')
@@ -346,9 +347,11 @@ def OshMain(argv0, argv, login_shell):
 
     do_exec = True
     if opts.fix:
-      #log('SPANS: %s', arena.spans)
       osh2oil.PrintAsOil(arena, node, opts.debug_spans)
       do_exec = False
+    if opts.parse_and_print_arena:
+      osh2oil.PrintArena(arena)
+      do_exec = False
     if exec_opts.noexec:
       do_exec = False
 

diff --git a/core/alloc.py b/core/alloc.py
@@ -107,6 +107,10 @@ def GetLineSpan(self, span_id):
       util.log('Span ID out of range: %d', span_id)
       raise
 
+  def LastSpanId(self):
+    """Return one past the last span ID."""
+    return len(self.spans)
+
   def GetDebugInfo(self, line_id):
     """Get the path and physical line number, for parse errors."""
     assert line_id != const.NO_INTEGER, line_id

diff --git a/osh/cmd_parse.py b/osh/cmd_parse.py
@@ -82,14 +82,25 @@ def GetCompletionState(self):
 
   def _MaybeReadHereDocs(self):
     for h in self.pending_here_docs:
+      here_end_line = None
+      here_end_line_id = -1
       lines = []
+
+      # "If any character in word is quoted, the delimiter shall be formed by
+      # performing quote removal on word, and the here-document lines shall not
+      # be expanded. Otherwise, the delimiter shall be the word itself."
+      # NOTE: \EOF counts, or even E\OF
+      ok, delimiter, quoted = word.StaticEval(h.here_begin)
+      if not ok:
+        p_die('Invalid here doc delimiter', word=h.here_begin)
+      do_expansion = not quoted
+
       #log('HERE %r' % h.here_end)
       while True:
         # If op is <<-, strip off all leading tabs (NOT spaces).
         # (in C++, just bump the start?)
         line_id, line = self.line_reader.GetLine()
 
-        #print("LINE %r %r" % (line, h.here_end))
         if not line:  # EOF
           # An unterminated here doc is just a warning in bash.  We make it
           # fatal because we want to be strict, and because it causes problems
@@ -100,40 +111,51 @@ def _MaybeReadHereDocs(self):
 
         # NOTE: Could do this runtime to preserve LST.
         if h.op.id == Id.Redir_DLessDash:
+          # NOTE: Stripping multiple leading tabs is correct!
           line = line.lstrip('\t')
-        if line.rstrip() == h.here_end:
+        if line.rstrip() == delimiter:
+          here_end_line = line
+          here_end_line_id = line_id
           break
 
         lines.append((line_id, line))
 
       parts = []
-      if h.do_expansion:
+      if do_expansion:
         # NOTE: We read all lines at once, instead of doing it line-by-line,
         # because of cases like this:
         # cat <<EOF
         # 1 $(echo 2
         # echo 3) 4
         # EOF
 
+        # NOTE: How to assign spids for these lines?
+        # VirtualLineReader needs to pick up the tokens somehow?
+
+        # self.arena.AddLineSpan() is the thing that assigns IDs.
+        # So here you just need to call self.AddLineSpan()!
+
         from osh import parse_lib  # Avoid circular import
         w_parser = parse_lib.MakeWordParserForHereDoc(lines, self.arena)
 
         # NOTE: There can be different kinds of parse errors in here docs.
-        word = w_parser.ReadHereDocBody()
-        assert word is not None
-        h.body = word
-        h.was_filled = True
+        w = w_parser.ReadHereDocBody()
+        assert w is not None
+        h.body = w
       else:
         # Each line is a single span.  TODO: Add span_id to token.
         tokens = [
             ast.token(Id.Lit_Chars, line, const.NO_INTEGER)
             for _, line in lines]
         parts = [ast.LiteralPart(t) for t in tokens]
         h.body = ast.CompoundWord(parts)
-        h.was_filled = True
 
-    # No .clear() until Python 3.3.
-    del self.pending_here_docs[:]
+      # Create a span with the end terminator.  Maintains the invariant that
+      # the spans "add up".
+      line_span = ast.line_span(here_end_line_id, 0, len(here_end_line))
+      unused_spid = self.arena.AddLineSpan(line_span)
+
+    del self.pending_here_docs[:]  # No .clear() until Python 3.3.
 
     return True
 
@@ -225,19 +247,10 @@ def ParseRedirect(self):
       node.op = op
       node.body = None  # not read yet
       node.fd = fd
-      node.was_filled = False
       self._Next()
 
       if not self._Peek(): return None
       node.here_begin = self.cur_word
-      # "If any character in word is quoted, the delimiter shall be formed by
-      # performing quote removal on word, and the here-document lines shall not
-      # be expanded. Otherwise, the delimiter shall be the word itself."
-      # NOTE: \EOF counts, or even E\OF
-      ok, node.here_end, quoted = word.StaticEval(node.here_begin)
-      if not ok:
-        p_die('Invalid here doc delimiter', word=node.here_begin)
-      node.do_expansion = not quoted
       self._Next()
 
       self.pending_here_docs.append(node)  # will be filled on next newline.
@@ -1500,4 +1513,10 @@ def ParseWholeFile(self):
     assert node is not None
     assert node is not False
 
+    # NOTE: This happens when there is no newline at the end of a file, like
+    # osh -c 'cat <<EOF'
+    if self.pending_here_docs:
+      node = self.pending_here_docs[0]  # Just show the first one?
+      p_die('Unterminated here doc began here', word=node.here_begin)
+
     return node
diff --git a/osh/cmd_parse_test.py b/osh/cmd_parse_test.py
@@ -228,7 +228,6 @@ def testUnquotedHereDoc(self):
     self.assertTrue(isinstance(dq, ast.DoubleQuotedPart))
     # 4 literal parts: VarSub, newline, right ", "two\n"
     self.assertEqual(4, len(dq.parts))
-    self.assertEqual(True, h.do_expansion)
 
   def testQuotedHereDocs(self):
     # Quoted here doc
@@ -241,7 +240,6 @@ def testQuotedHereDocs(self):
     self.assertEqual(1, len(node.redirects))
     h = node.redirects[0]
     self.assertEqual(2, len(h.body.parts))  # 2 literal parts
-    self.assertEqual(False, h.do_expansion)
 
     node = assertParseCommandLine(self, """\
 cat <<'EOF'
@@ -251,7 +249,6 @@ def testQuotedHereDocs(self):
     self.assertEqual(1, len(node.redirects))
     h = node.redirects[0]
     self.assertEqual(1, len(h.body.parts))  # 1 line, one literal part
-    self.assertEqual(False, h.do_expansion)
 
     # \ escape
     node = assertParseCommandLine(self, r"""\
@@ -262,7 +259,6 @@ def testQuotedHereDocs(self):
     self.assertEqual(1, len(node.redirects))
     h = node.redirects[0]
     self.assertEqual(1, len(h.body.parts))  # 1 line, one literal part
-    self.assertEqual(False, h.do_expansion)
 
   def testLeadingTabs(self):
     node = assertParseCommandLine(self, """\
@@ -1200,21 +1196,6 @@ def testCommand(self):
 
     err = _assertParseCommandListError(self, 'ls < <')
 
-    # Invalid words as here docs
-    err = _assertParseCommandListError(self, 'cat << $(invalid here end)')
-
-    # TODO: Arith parser doesn't have location information
-    err = _assertParseCommandListError(self, 'cat << $((1+2))')
-    err = _assertParseCommandListError(self, 'cat << a=(1 2 3)')
-    err = _assertParseCommandListError(self, r'cat << \a$(invalid)')
-
-    # Actually the $invalid part should be highlighted... yeah an individual
-    # part is the problem.
-    err = _assertParseCommandListError(self, r"cat << 'single'$(invalid)")
-    err = _assertParseCommandListError(self, r'cat << "double"$(invalid)')
-    err = _assertParseCommandListError(self, r'cat << ~foo/$(invalid)')
-    err = _assertParseCommandListError(self, r'cat << $var/$(invalid)')
-
     # Word parse error in command parser
     err = _assertParseCommandListError(self, r'echo foo$(ls <)bar')
 

diff --git a/osh/osh.asdl b/osh/osh.asdl
@@ -135,16 +135,14 @@ module osh
   --   pass could StaticEval it to a string and set do_expansion.
   -- * To reprint the here doc, we need the here_end delimiter, but it doesn't
   --   matter at runtime.  do_expansion is calculated from it.
-  -- * was_filled is only used during the parse and should be eliminated from
-  --   serialization format.
   -- TODO : id -> token for translation?
 
   redir = 
     Redir(token op, int fd, word arg_word)
   | HereDoc(token op, int fd,
-            word here_begin,  -- For translation
-            string here_end, bool do_expansion, -- Derived from here_begin
-            word? body, bool was_filled)
+            word here_begin,  -- e.g. EOF or 'EOF'
+            int here_end_span_id,  -- this span is an entire line
+            word? body)
 
   assign_op = Equal | PlusEqual
   assign_pair = (lhs_expr lhs, assign_op op, word? rhs)

diff --git a/osh/word_parse.py b/osh/word_parse.py
@@ -642,6 +642,7 @@ def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
     left_spid = const.NO_INTEGER  # gets set later
     right_spid = const.NO_INTEGER  # gets set later
 
+    # TODO: Use here doc.
     if self.cur_token is not None:  # None in here doc case
       left_token = self.cur_token
       left_spid = left_token.span_id
@@ -682,7 +683,7 @@ def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
           quoted_part.parts.append(ast.LiteralPart(self.cur_token))
         else:
           done = True  # assume Id.Right_DoubleQuote
-          right_spid = self.cur_token.span_id
+        right_spid = self.cur_token.span_id
 
       elif self.token_kind == Kind.Eof:
         if here_doc:  # here docs will have an EOF in their token stream
@@ -1203,8 +1204,13 @@ def ReadHereDocBody(self):
       CompoundWord.  NOTE: We could also just use a DoubleQuotedPart for both
       cases?
     """
-    w = ast.CompoundWord()
     dq = self._ReadDoubleQuotedPart(here_doc=True)
     assert dq is not None
-    w.parts.append(dq)
-    return w
+    return ast.CompoundWord([dq])
+
+  # TODO: _ReadDQContext(parts) should be shared between
+  # _ReadDoubleQuotedPart() and ReadHereDocBody()
+  # Call with dq_part.parts
+  # and here_doc_node.body
+
+
diff --git a/spec/here-doc.test.sh b/spec/here-doc.test.sh
@@ -295,12 +295,14 @@ EOF
 cat <<-EOF
 	1
 	2
-  3
+		3  # 2 tabs are both stripped
+  4  # spaces are preserved
 EOF
 ## STDOUT:
 1
 2
-  3
+3  # 2 tabs are both stripped
+  4  # spaces are preserved
 ## END
 
 #### Here doc within subshell with boolean

diff --git a/test/arena.sh b/test/arena.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Usage:
+#   ./arena.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+source test/common.sh
+
+compare() {
+  local path=$1
+
+  mkdir -p _tmp/arena
+  bin/osh --parse-and-print-arena $path > _tmp/arena/left.txt
+  diff -u $path _tmp/arena/left.txt
+}
+
+here-doc() {
+  compare test/arena/here-dq.sh
+  compare test/arena/here-sq.sh
+}
+
+readonly -a PASSING=(
+  here-doc
+)
+
+all-passing() {
+  run-all "${PASSING[@]}"
+}
+
+run-for-release() {
+  local out_dir=_tmp/arena
+  mkdir -p $out_dir
+
+  all-passing | tee $out_dir/log.txt
+
+  echo "Wrote $out_dir/log.txt"
+}
+
+"$@"
diff --git a/test/arena/here-dq.sh b/test/arena/here-dq.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+echo "DQ"
+
+cat <<EOF
+here
+doc $var
+EOF 
+echo --
diff --git a/test/arena/here-sq.sh b/test/arena/here-sq.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+echo 'SQ'
+
+cat <<'EOF'
+here
+doc $var
+EOF
+echo --
diff --git a/test/parse-errors.sh b/test/parse-errors.sh
@@ -282,13 +282,33 @@ EOF
 '
 }
 
+here-doc-delimiter() {
+  set +o errexit
+
+  # NOTE: This is more like the case where.
+  _error-case 'cat << $(invalid here end)'
+
+  # TODO: Arith parser doesn't have location information
+  _error-case 'cat << $((1+2))'
+  _error-case 'cat << a=(1 2 3)'
+  _error-case 'cat << \a$(invalid)'
+
+  # Actually the $invalid part should be highlighted... yeah an individual
+  # part is the problem.
+  #"cat << 'single'$(invalid)"
+  _error-case 'cat << "double"$(invalid)'
+  _error-case 'cat << ~foo/$(invalid)'
+  _error-case 'cat << $var/$(invalid)'
+}
+
 cases-in-strings() {
   set +o errexit
 
   cmd-parse
   simple-command
   redirect
   here-doc
+  here-doc-delimiter
 
   # Word
   word-parse