From da4a6868daaa3c4dc63620568690cdf58c2aacb3 Mon Sep 17 00:00:00 2001 From: Andy Chu Date: Tue, 28 Aug 2018 09:31:25 -0700 Subject: [PATCH] Fix LST invariant when using <<- in the single-quoted context. Found this bug by testing the arena invariant over the 'wild' corpus. TODO: Still have a bug in the double-quoted context. - test/osh2oil.sh here-doc has a failing case for this - test/arena.sh here-doc also has a failing case --- core/reader.py | 6 +++--- core/reader_test.py | 4 ++-- osh/cmd_parse.py | 34 ++++++++++++++++++++++++---------- osh/osh.asdl | 15 ++++++++++----- osh/parse_lib.py | 3 +-- test/arena.sh | 34 +++++++++++++++++++++++++++------- test/arena/here-dq-indented.sh | 7 +++++++ test/osh2oil.sh | 24 +++++++++++++++++++----- 8 files changed, 93 insertions(+), 34 deletions(-) create mode 100644 test/arena/here-dq-indented.sh diff --git a/core/reader.py b/core/reader.py index c82614fb2b..efe80e7597 100644 --- a/core/reader.py +++ b/core/reader.py @@ -95,7 +95,7 @@ def StringLineReader(s, arena): # internally. -class VirtualLineReader(_Reader): +class HereDocLineReader(_Reader): """Used for here docs.""" def __init__(self, lines, arena): """ @@ -110,7 +110,7 @@ def __init__(self, lines, arena): def GetLine(self): if self.pos == self.num_lines: return -1, None - line_id, line = self.lines[self.pos] + line_id, line, start_offset = self.lines[self.pos] self.pos += 1 - return line_id, line + return line_id, line[start_offset:] diff --git a/core/reader_test.py b/core/reader_test.py index 85640334fc..bc13336f98 100755 --- a/core/reader_test.py +++ b/core/reader_test.py @@ -34,8 +34,8 @@ def testLineReadersAreEquivalent(self): r2 = reader.FileLineReader(f, a2) a3 = self.pool.NewArena() - lines = [(0, 'one\n'), (1, 'two')] - r3 = reader.VirtualLineReader(lines, a3) + lines = [(0, 'one\n', 0), (1, 'two', 0)] + r3 = reader.HereDocLineReader(lines, a3) for a in [a1, a2, a3]: a.PushSource('reader_test.py') diff --git a/osh/cmd_parse.py b/osh/cmd_parse.py index c3e310bd0e..2e32f5089f 100644 --- a/osh/cmd_parse.py +++ b/osh/cmd_parse.py @@ -13,6 +13,7 @@ from asdl import const from core import braces +from core import reader from core import word from core import util @@ -86,7 +87,8 @@ def _MaybeReadHereDocs(self): for h in self.pending_here_docs: here_end_line = None here_end_line_id = -1 - lines = [] + here_end_start_offset = 0 + here_lines = [] # "If any character in word is quoted, the delimiter shall be formed by # performing quote removal on word, and the here-document lines shall not @@ -115,37 +117,49 @@ def _MaybeReadHereDocs(self): # If op is <<-, strip off ALL leading tabs -- not spaces, and not just # the first tab. + start_offset = 0 if h.op.id == Id.Redir_DLessDash: - line = line.lstrip('\t') - if line.rstrip() == delimiter: - here_end_line = line + n = len(line) + i = 0 + while i < n: + if line[i] != '\t': + break + i += 1 + start_offset = i + + if line[start_offset:].rstrip() == delimiter: here_end_line_id = line_id + here_end_line = line + here_end_start_offset = start_offset break - lines.append((line_id, line)) + here_lines.append((line_id, line, start_offset)) + + line_reader = reader.HereDocLineReader(here_lines, self.arena) parts = [] if delim_quoted: # << 'EOF' # Create a line_span and a token for each line. tokens = [] - for line_id, line in lines: - line_span = ast.line_span(line_id, 0, len(line)) + for line_id, line, start_offset in here_lines: + line_span = ast.line_span(line_id, start_offset, len(line)) span_id = self.arena.AddLineSpan(line_span) - t = ast.token(Id.Lit_Chars, line, span_id) + t = ast.token(Id.Lit_Chars, line[start_offset:], span_id) tokens.append(t) # LiteralPart for each line. h.stdin_parts = [ast.LiteralPart(t) for t in tokens] else: from osh import parse_lib # Avoid circular import - w_parser = parse_lib.MakeWordParserForHereDoc(lines, self.arena) + w_parser = parse_lib.MakeWordParserForHereDoc(line_reader, self.arena) # NOTE: There can be different kinds of parse errors in here docs. w = w_parser.ReadHereDocBody(h.stdin_parts) # Create a span with the end terminator. Maintains the invariant that # the spans "add up". - line_span = ast.line_span(here_end_line_id, 0, len(here_end_line)) + line_span = ast.line_span( + here_end_line_id, here_end_start_offset, len(here_end_line)) h.here_end_span_id = self.arena.AddLineSpan(line_span) del self.pending_here_docs[:] # No .clear() until Python 3.3. diff --git a/osh/osh.asdl b/osh/osh.asdl index b0b675cf4a..c468d6fb21 100644 --- a/osh/osh.asdl +++ b/osh/osh.asdl @@ -2,7 +2,13 @@ -- -- Invariant: the source text can be reconstructed byte-for-byte from this -- tree. --- +-- +-- Exceptions: +-- * <<- here docs with leading tabs, since we don't want those for +-- conversion. We don't want files with mixed tabs and spaces. +-- * Found to be not strictly necessary for oil conversion +-- * foo() { } vs function foo { } -- ksh +-- -- The AST is composed of the builtin ASDL types (string, int, bool) and our -- application type 'id', which is core.id_kind.Id. @@ -28,9 +34,8 @@ -- * 1 + 2*3 vs. 1 + (2*3) or even 1 + ((2*3)) -- * [[ (1 == 1) ]] vs [[ 1 == 1 ]] -- * $(( 1 + 2 )) vs $[1 + 2] (bash-specific, used by Aboriginal) --- --- Found to be not strictly necessary for oil conversion --- * foo() { } vs function foo { } -- ksh +-- TODO: +-- * remove redundancy of 'string val' in token. It should use the span. module osh { @@ -38,7 +43,7 @@ module osh -- TODO: Need arena_id, for different files line_span = (int line_id, int col, int length) - -- A primitive token. NOTE: val is redundant with 'loc' for now. If we + -- A primitive token. NOTE: val is redundant with 'span_id' for now. If we -- rewrite the parser in C++, we might care for memory footprint. But for -- now this is convenient. -- NOTE: identical strings can shared, if we care. diff --git a/osh/parse_lib.py b/osh/parse_lib.py index 81ffd936b4..426a8a5c46 100644 --- a/osh/parse_lib.py +++ b/osh/parse_lib.py @@ -64,8 +64,7 @@ def MakeParserForCompletion(code_str, arena): return w_parser, c_parser -def MakeWordParserForHereDoc(lines, arena): - line_reader = reader.VirtualLineReader(lines, arena) +def MakeWordParserForHereDoc(line_reader, arena): line_lexer = lexer.LineLexer(match.MATCHER, '', arena) lx = lexer.Lexer(line_lexer, line_reader) return word_parse.WordParser(lx, line_reader) diff --git a/test/arena.sh b/test/arena.sh index a868421621..839b93d836 100755 --- a/test/arena.sh +++ b/test/arena.sh @@ -7,24 +7,44 @@ set -o nounset set -o pipefail set -o errexit -source test/common.sh +# TODO: Need include guard in test/common.sh! I ran into this before. +#source test/common.sh # for all-passing, etc. +source test/wild-runner.sh # For MANIFEST, etc. -compare() { +_compare() { local path=$1 mkdir -p _tmp/arena bin/osh --parse-and-print-arena $path > _tmp/arena/left.txt - diff -u $path _tmp/arena/left.txt + if diff -u $path _tmp/arena/left.txt; then + echo "$path" + else + return 1 + fi } here-doc() { - compare test/arena/here-dq.sh - compare test/arena/here-sq.sh - compare test/arena/here-multiple.sh + _compare test/arena/here-dq.sh + _compare test/arena/here-sq.sh + _compare test/arena/here-multiple.sh + _compare test/arena/here-dq-indented.sh } tilde() { - compare test/arena/tilde.sh + _compare test/arena/tilde.sh +} + +_compare-wild() { + local rel_path=$1 + local abs_path=$2 + + _compare $abs_path +} + +# Run on wild corpus +wild() { + wc -l $MANIFEST + cat $MANIFEST | xargs -n 2 -- $0 _compare-wild } readonly -a PASSING=( diff --git a/test/arena/here-dq-indented.sh b/test/arena/here-dq-indented.sh new file mode 100644 index 0000000000..0a8d58122a --- /dev/null +++ b/test/arena/here-dq-indented.sh @@ -0,0 +1,7 @@ +#!/bin/bash + + cat <<-ONE + indented + body + ONE +OSH diff --git a/test/osh2oil.sh b/test/osh2oil.sh index bdb08d3d3d..2eaab53a7d 100755 --- a/test/osh2oil.sh +++ b/test/osh2oil.sh @@ -300,6 +300,7 @@ OIL } here-doc() { + # DQ context osh0-oil3 << 'OSH' 3<< 'OIL' cat <