Permalink
Browse files

Word eval and the read builtin now use the new IFS splitting algorithm.

It took a couple of tries to get the concepts right, but this fixes even
more bugs in word splitting.

The read builtin now fully supports backslash escaping (without -r).

Dozens more spec tests pass, e.g. in
spec/{word-split,var-sub-quote,word-eval,builtin-io}.test.sh.

All unit tests pass.

Details:

- Change the representation of evaluated word parts.  Add
  part_value.CompoundPartValue because it is logically a tree.
  TODO: We can flatten this on the fly by using the recursive
  accumulator style.

- Stop using WhitespaceSplitter.  A single IfsSplitter works better,
  because both have to handle backslash escaping.

- 'read' uses a new _AppendParts() function which handles backslash
  escapes and line continuation.
  - Fix a bug where REPLY was not respected, and add test.

- Fix a bug with echo -e "\\", and add tests.  The Python regex wasn't
  properly escaped!

- Use runtime.asdl for Span so we can debug with pretty-printing.
- asdl/format.py: Change pretty-printing of ASDL bools.
- Remove old regex-based IFS splitting code.
- Add a text file on word evaluation.

NOTE: we have to escape inside out!

1. maybe GlobEscape
2. maybe IFS escape
3. split with IFS
4. glob
  • Loading branch information...
Andy Chu
Andy Chu committed Jan 4, 2018
1 parent 7192efd commit a0347512d9684358bd11a45f8f5d64ce0c061417
Showing with 844 additions and 456 deletions.
  1. +4 −1 asdl/format.py
  2. +83 −31 core/builtin.py
  3. +9 −29 core/builtin_test.py
  4. +1 −1 core/glob_.py
  5. +89 −155 core/legacy.py
  6. +72 −44 core/legacy_test.py
  7. +8 −12 core/runtime.asdl
  8. +201 −171 core/word_eval.py
  9. +270 −0 doc/word-eval.txt
  10. +63 −4 spec/builtin-io.test.sh
  11. +28 −0 spec/var-sub-quote.test.sh
  12. +12 −4 spec/word-split.test.sh
  13. +4 −4 test/spec.sh
View
@@ -247,9 +247,12 @@ def MakeFieldSubtree(obj, field_name, desc, abbrev_hook, omit_empty=True):
raise AssertionError(
'%s is missing field %r' % (obj.__class__, field_name))
if isinstance(desc, asdl.IntType) or isinstance(desc, asdl.BoolType):
if isinstance(desc, asdl.IntType):
out_val = _ColoredString(str(field_val), _OTHER_LITERAL)
elif isinstance(desc, asdl.BoolType):
out_val = _ColoredString('T' if field_val else 'F', _OTHER_LITERAL)
elif isinstance(desc, asdl.Sum) and asdl.is_simple(desc):
out_val = field_val.name
View
@@ -43,6 +43,7 @@
value_e = runtime.value_e
scope_e = runtime.scope_e
span_e = runtime.span_e
var_flags_e = runtime.var_flags_e
log = util.log
e_die = util.e_die
@@ -229,7 +230,7 @@ def Resolve(argv0):
# - the DOLLAR_SQ lex state needs most of this logic.
ECHO_LEXER = lexer.SimpleLexer([
R(r'\\[0abeEfrtnv]', Id.Char_OneChar),
R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),
C(r'\c', Id.Char_Stop),
# Note: tokens above \0377 can either be truncated or be flagged a syntax
@@ -245,7 +246,7 @@ def Resolve(argv0):
R(r'\\.', Id.Char_Literals),
# Backslash that ends the string.
R('\\$', Id.Char_Literals),
R(r'\\$', Id.Char_Literals),
# For re2c. TODO: need to make that translation.
C('\\\0', Id.Char_Literals),
])
@@ -262,6 +263,7 @@ def Resolve(argv0):
'r': '\r',
't': '\t',
'v': '\v',
'\\': '\\',
}
@@ -516,6 +518,58 @@ def Jobs(argv, job_state):
# 1. Process backslashes (or don't if it's -r)
# 2. Split.
def _AppendParts(s, spans, max_results, join_next, parts):
  r"""Append the fields found in one line of input to parts.

  Args:
    s: The original string
    spans: List of (span_type, end_index) pairs covering s from left to
      right.  Each span starts where the previous one ended, and end_index
      is used as the exclusive end of the slice taken from s.
    max_results: the maximum number of parts we want.  Once parts holds this
      many entries, later Black and Delim spans are appended to the last
      entry instead of starting a new one.
    join_next: Whether to join the next span to the previous part.  This
      happens in two cases:
      - when we have '\ ' (a Backslash span right after a Black span)
      - and when we have more spans than max_results.
    parts: List of strings.  It is extended and modified IN PLACE.

  Returns:
    A (done, join_next) pair.  done is False only when the last span is a
    Backslash; otherwise it is True (including when spans is empty).
    join_next is the updated flag, suitable for passing to the next call
    together with the same parts list.
  """
  start_index = 0
  # If the last span was black, and we get a backslash, set join_next to merge
  # two black spans.
  last_span_was_black = False

  for span_type, end_index in spans:
    if span_type == span_e.Black:
      if join_next and parts:
        parts[-1] += s[start_index:end_index]
        join_next = False
      else:
        parts.append(s[start_index:end_index])
      last_span_was_black = True

    elif span_type == span_e.Delim:
      # Same guard as the Black branch: with nothing accumulated yet there is
      # no previous part to extend, so indexing parts[-1] would raise
      # IndexError.
      if join_next and parts:
        parts[-1] += s[start_index:end_index]
        join_next = False
      last_span_was_black = False

    elif span_type == span_e.Backslash:
      if last_span_was_black:
        join_next = True
      last_span_was_black = False

    # We already have as many parts as requested: glue everything that
    # follows onto the last one.
    if len(parts) >= max_results:
      join_next = True

    start_index = end_index

  # A trailing Backslash span means this line is not finished.
  ends_with_backslash = bool(spans) and spans[-1][0] == span_e.Backslash
  return not ends_with_backslash, join_next
READ_SPEC = _Register('read')
READ_SPEC.ShortFlag('-r')
@@ -525,9 +579,6 @@ def Jobs(argv, job_state):
def Read(argv, splitter, mem):
arg, i = READ_SPEC.Parse(argv)
if not arg.r:
util.warn('*** read without -r not implemented ***')
names = argv[i:]
if arg.n is not None: # read a certain number of bytes
try:
@@ -541,40 +592,41 @@ def Read(argv, splitter, mem):
# NOTE: Even if we don't get n bytes back, there is no error?
return 0
# We have to read more than one line if there is a line continuation (and
# it's not -r).
line = sys.stdin.readline()
if not line: # EOF
return 1
if line.endswith('\n'): # strip trailing newline
line = line[:-1]
status = 0
else:
# odd bash behavior: fail even if we can set variables.
status = 1
if not names:
names.append('REPLY')
# leftover words assigned to the last name
n = len(names)
max_results = len(names)
# We have to read more than one line if there is a line continuation (and
# it's not -r).
#ifs = legacy.GetIfs(mem)
parts = splitter.SplitForRead(line, not arg.r)
parts = []
join_next = False
while True:
line = sys.stdin.readline()
#log('LINE %r', line)
if not line: # EOF
status = 1
break
# If the last part is ignored and it consists of a single \, then we need to
# read another line! And we need to JOIN the other non-ignored segments on
# adjacent lines!!!
if line.endswith('\n'): # strip trailing newline
line = line[:-1]
status = 0
else:
# odd bash behavior: fail even if we can set variables.
status = 1
continued = False
#log('split: %s %s', parts, continued)
#
spans = splitter.SplitForRead(line, not arg.r)
done, join_next = _AppendParts(line, spans, max_results, join_next, parts)
# TODO: replace this with the above
strs = line.split(None, n-1)
#log('PARTS %s continued %s', parts, continued)
if done:
break
# TODO: Use REPLY variable here too?
for i in xrange(n):
for i in xrange(max_results):
try:
s = strs[i]
s = parts[i]
except IndexError:
s = '' # if there are too many variables
#log('read: %s = %s', names[i], s)
View
@@ -5,6 +5,7 @@
import unittest
from core import legacy
from core import lexer
from core import builtin # module under test
@@ -17,39 +18,18 @@ def testEchoLexer(self):
print list(lex.Tokens(r'unicode \u0065 \U00000065'))
print list(lex.Tokens(r'\d \e \f \g'))
def testSplitLine(self):
# NOTE: This function can be rewritten in C or C++. Use ASAN / fuzzing?
# It's similar to the functions in core/glob_.py.
#
# Can you use regexes? Need different regexes for "allow_escape".
# Nah I think I need to rewrite _IfsSplit in word_eval.c.
# That is a very similar function.
#
# Can you lift it from dash? The other shells are all GPL.
# word_eval_test._IfsSplit has at least one bug! With IFS='_ '. Maybe
# should test that here.
DEFAULT_IFS = ' \t\n'
OTHER_IFS = ':'
def testAppendParts(self):
# allow_escape is True by default, but False when the user passes -r.
CASES = [
#(' one two ', DEFAULT_IFS, False, ['one', 'two'], False),
(' one:two ', OTHER_IFS, True, [' one', 'two '], False),
(' one\:two ', OTHER_IFS, True, [' one:two '], False),
(' one\:two ', OTHER_IFS, False, [r' one\', two '], False),
(['Aa', 'b', ' a b'], 'Aa b \\ a\\ b'),
]
# Not working yet!
return
for line, ifs, allow_escape, expected_parts, expected_c in CASES:
parts, continued = builtin._SplitLine(line, ifs, allow_escape)
self.assertEqual(expected_parts, parts,
'%r: %s != %s' % (line, expected_parts, parts))
self.assertEqual(expected_c, continued,
'%r: %s != %s' % (line, expected_c, continued))
for expected_parts, line in CASES:
sp = legacy.IfsSplitter(legacy.DEFAULT_IFS, '')
spans = sp.Split(line, True)
parts = []
builtin._AppendParts(line, spans, 100, False, parts)
self.assertEqual(expected_parts, parts)
if __name__ == '__main__':
View
@@ -160,7 +160,7 @@ def _GlobUnescape(s): # used by cmd_exec
while i < n:
c = s[i]
if c == '\\':
assert i != n - 1, 'There should be no trailing single backslash!'
assert i != n - 1, 'Trailing backslash: %r' % s
i += 1
c2 = s[i]
if c2 in GLOB_META_CHARS:
Oops, something went wrong.

0 comments on commit a034751

Please sign in to comment.