Permalink
Browse files

Functions to 1) detect words that need brace expansion, and 2) expand…

… them.

Uses the new AltPart.  NumRangePart and CharRangePart are not used yet.

A lot of new test cases.  Every shell implements a different set of
brace expansions:

dash: no brace expansion at all
mksh: only {word1,word2} expansion
zsh: the above, plus numeric range expansion {1..10..2}
bash : the above, plus character range expansion {a..z..2}

The algorithm to detect expansion scans through the parts linearly,
creating a tree of CompoundWord/AltPart.  It pushes to a stack when it
sees { and pops when it sees }.

The algorithm to expand the tree has two recursive calls.  It's like
a Cartesian product with some extra features.

TODO:
- Unit tests are a bit messy and need to be cleaned up.
- Better error handling and tests for error cases
- Numeric range expansion, since two shells implement it
  • Loading branch information...
Andy Chu
Andy Chu committed Mar 11, 2017
1 parent ce1a3ac commit 43581dec1a85b748698d6aa592e268187f003b43
Showing with 520 additions and 5 deletions.
  1. +227 −0 core/braces.py
  2. +144 −0 core/braces_test.py
  3. +4 −0 osh/osh.asdl
  4. +2 −1 spec.sh
  5. +143 −4 tests/brace-expansion.test.sh
View
@@ -0,0 +1,227 @@
#!/usr/bin/env python3
"""
braces.py
NOTE: bash implements brace expansion in the braces.c file (835 lines). It
uses goto!
Possible optimization flags for CompoundWord:
- has Lit_LBrace, Lit_RBrace -- set during word_parse phase
- if it has both, then do _BraceDetect
- has AltPart -- set during _BraceDetect
- if it does, then do the expansion
- has Lit_Star, ?, [ ] -- globbing?
- but after expansion do you still have those flags?
"""
import sys
from core.id_kind import Id
from osh import ast_ as ast
word_part_e = ast.word_part_e
def _BraceDetect(w):
    """Turn the literal braces and commas of a word into a tree of AltParts.

    Args:
      w: CompoundWord whose parts are scanned once, left to right.

    Returns:
      A new CompoundWord in which each `{ ... }` group has become a single
      AltPart holding one CompoundWord per comma-separated alternative.
      (Whether to return None for words that should not be expanded is still
      an open question.)

    The grammar being recognized is roughly:

      # an alternative is a literal, possibly empty, or another brace_expr
      part = <any part except LiteralPart>
      alt = part* | brace_expr

      # a brace_expr is a group of at least 2 braced and comma-separated
      # alternatives, with optional prefix and suffix.
      brace_expr = part* '{' alt ',' alt (',' alt)* '}' part*

    That grammar is not LL(1), so this is an iterative scan with two stacks
    rather than a recursive parser:
      '{'  saves the parts collected so far and opens a new AltPart
      ','  closes the current alternative
      '}'  closes the last alternative and appends the finished AltPart to the
           saved parts

    Error cases that are NOT handled gracefully yet:
      }a{     stack depth dips below 0 (indexing the empty stack raises
              IndexError)
      {a,b}{  stack depth doesn't end at 0 (trips the asserts at the end)
      {a}     no comma, and also not a numeric range (still becomes an AltPart
              with a single alternative)
    """
    level_parts = []   # parts collected at the current nesting level
    saved_parts = []   # per open '{': the parts that came before it
    open_alts = []     # per open '{': the AltPart being filled in

    for part in w.parts:
        # Only literal tokens can be brace syntax; everything else is content.
        if part.tag != word_part_e.LiteralPart:
            level_parts.append(part)
            continue

        token_id = part.token.id
        if token_id == Id.Lit_LBrace:
            # Save prefix parts.  Start a new parts list and a new AltPart.
            saved_parts.append(level_parts)
            level_parts = []
            open_alts.append(ast.AltPart())

        elif token_id == Id.Lit_Comma:
            # The parts since the last '{' or ',' form one alternative.
            open_alts[-1].words.append(ast.CompoundWord(level_parts))
            level_parts = []

        elif token_id == Id.Lit_RBrace:
            # TODO:
            # - Detect lack of , -- abort the whole thing
            # - Detect {1..10} and {1..10..2}
            #   - bash and zsh only -- this is NOT implemented by mksh
            #   - Use a regex on the middle part:
            #     - digit+ '..' digit+ ( '..' digit+ )?
            # - Char ranges are bash only!
            #
            # ast.NumRangePart()
            # ast.CharRangePart()
            open_alts[-1].words.append(ast.CompoundWord(level_parts))

            # TODO: catch errors here
            # Go back to the enclosing level and attach the finished AltPart.
            level_parts = saved_parts.pop()
            level_parts.append(open_alts.pop())

        else:
            # Any other literal is ordinary content.
            level_parts.append(part)

    # TODO: Errors here
    assert not open_alts
    assert not saved_parts
    return ast.CompoundWord(level_parts)
# Possible optimization for later:
def _TreeCount(tree_word):
    """Placeholder for counting the output size, for allocation purposes.

    The intent is to count the number of words a tree expands into and the
    maximum number of parts in any of those words.  Every word can have a
    different number of parts, e.g. -{'a'b,c}- expands into words of 4 parts,
    then 3 parts.

    For now the tree is only walked; the returned numbers are fixed.

    Args:
      tree_word: word whose `parts` may contain AltPart nodes.

    Returns:
      The tuple (num_results, max_parts), currently always (2, 5).
    """
    # TODO: Copy the structure of _BraceExpand and _BraceExpandOne.
    for part in tree_word.parts:
        if part.tag != word_part_e.AltPart:
            continue
        for _ in part.words:
            pass

    return 2, 5
def _BraceExpandOne(parts, first_alt_index, suffix):
    """Expand a single AltPart of `parts`; helper for _BraceExpand.

    Args:
      parts: input parts
      first_alt_index: index in `parts` of the AltPart to expand
      suffix: the parts to append after each expanded alternative

    Returns:
      A list of part lists.  Each one is the parts before the AltPart, then
      one expansion of one alternative, then `suffix`.
    """
    alt_part = parts[first_alt_index]
    leading = parts[:first_alt_index]

    # An alternative can contain braces itself, so expand each one recursively.
    choices = []
    for alt_word in alt_part.words:
        choices += _BraceExpand(alt_word.parts)

    expansions = []
    for choice in choices:
        combined = list(leading)
        combined += choice
        combined += suffix
        # TODO: Do we need to preserve flags?
        expansions.append(combined)
    return expansions
def _BraceExpand(parts):
    """Expand every AltPart in a list of word parts.

    Args:
      parts: list of word parts, possibly containing AltPart nodes.

    Returns:
      A list of part lists, one per resulting word, in left-to-right order:
      the alternatives of the leftmost AltPart vary slowest, like in bash
      ({a,b}{c,d} gives ac ad bc bd).  If `parts` holds no AltPart the result
      is `[parts]`, i.e. the same list object rather than a copy.
    """
    first_alt_index = -1
    for i, part in enumerate(parts):
        if part.tag == word_part_e.AltPart:
            first_alt_index = i
            break  # only the leftmost alternation is handled at this level

    if first_alt_index == -1:
        return [parts]

    # NOTE: There are TWO recursive calls here, not just one -- one for
    # nested {} (inside _BraceExpandOne), and one for adjacent {} (below).
    # Thus it's hard to do iteratively.
    #
    # The tail starts right after the first AltPart, so that parts sitting
    # between two adjacent alternations are kept.  If the tail has no AltPart
    # the recursive call simply returns [tail_parts].
    tail_parts = parts[first_alt_index + 1:]
    per_suffix = [
        _BraceExpandOne(parts, first_alt_index, suffix)
        for suffix in _BraceExpand(tail_parts)  # recursive call
    ]

    # Every entry of per_suffix lists the same alternatives in the same order.
    # Walking them in lockstep groups the words by alternative first and by
    # suffix second, which preserves left-to-right order.
    out = []
    for same_alternative in zip(*per_suffix):
        out.extend(same_alternative)
    return out
def _Cartesian(tuples):
if len(tuples) == 1:
for x in tuples[0]:
yield (x,)
else:
for x in tuples[0]:
for y in _Cartesian(tuples[1:]):
yield (x,) + y # join tuples
def main(argv):
    """Print two sample Cartesian products, separated by a '--' line.

    Args:
      argv: command line arguments; not used.
    """
    single = [('a', 'b')]
    triple = [('a', 'b'), ('c', 'd', 'e'), ('f', 'g')]

    for index, sets in enumerate((single, triple)):
        if index:
            print('--')
        for combo in _Cartesian(sets):
            print(combo)
if __name__ == '__main__':
    # Script entry point: a RuntimeError from main() is reported on stderr and
    # turned into exit status 1.
    try:
        main(sys.argv)
    except RuntimeError as err:
        message = 'FATAL: %s' % err
        print(message, file=sys.stderr)
        sys.exit(1)
View
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""
braces_test.py: Tests for braces.py
"""
from pprint import pprint
import sys
import unittest
from core import braces # module under test
from osh import word_parse_test
from osh import ast_ as ast
word_part_e = ast.word_part_e
# Silly wrapper
def _assertReadWord(*args):
    """Forward all arguments to word_parse_test._assertReadWord and return its result."""
    word = word_parse_test._assertReadWord(*args)
    return word
def _ColorPrint(n):
    """Print node `n` as a tree on stdout, using the asdl formatter's ANSI output."""
    from asdl import format as fmt

    ansi_out = fmt.AnsiOutput(sys.stdout)
    fmt.PrintTree(fmt.MakeTree(n), ansi_out)
class BracesTest(unittest.TestCase):
    """Checks the shape of the trees built by braces._BraceDetect and the
    number of words produced by braces._BraceExpand."""

    def _Detect(self, code):
        """Read `code` as one word and return the tree _BraceDetect builds."""
        word = _assertReadWord(self, code)
        return braces._BraceDetect(word)

    def _AssertAltPart(self, part, num_words):
        """Assert that `part` is an AltPart with `num_words` alternatives."""
        self.assertEqual(word_part_e.AltPart, part.tag)
        self.assertEqual(num_words, len(part.words))

    def testBraceDetect(self):
        # For these, only the number of top-level parts is checked.
        flat_cases = [
            ('B-{a,b}-E', 3),
            # Multiple parts for each alternative
            ('B-{a"a",b"b",c"c"}-E', 3),
            # Multiple expansion
            ('B-{a,b}--{c,d}-E', 5),
        ]
        for code, num_parts in flat_cases:
            tree = self._Detect(code)
            self.assertEqual(num_parts, len(tree.parts))
            pprint(tree)
            print('--')

        # Nested expansion
        tree = self._Detect('B-{a,b,c,={d,e}}-E')
        pprint(tree)
        self.assertEqual(3, len(tree.parts))  # B- {} -E

        outer = tree.parts[1]
        self._AssertAltPart(outer, 4)  # a b c ={d,e}

        last_word = outer.words[3]
        self.assertEqual(2, len(last_word.parts))  # = {d,e}
        self._AssertAltPart(last_word.parts[1], 2)  # {d,e}

        # Another nested expansion
        tree = self._Detect('B-{a,={b,c}=,d}-E')
        pprint(tree)
        self.assertEqual(3, len(tree.parts))  # B- {} -E

        outer = tree.parts[1]
        self._AssertAltPart(outer, 3)  # a ={b,c}= d

        first_word = outer.words[0]
        _ColorPrint(first_word)
        self.assertEqual(1, len(first_word.parts))  # a

        middle_word = outer.words[1]
        self.assertEqual(3, len(middle_word.parts))  # = {b,c} =
        self._AssertAltPart(middle_word.parts[1], 2)  # b c

        # Third alternative is a CompoundWord with zero parts
        tree = self._Detect('{a,b,}')
        pprint(tree)
        self.assertEqual(1, len(tree.parts))
        self.assertEqual(3, len(tree.parts[0].words))

    def testBraceExpand(self):
        # (input, number of top-level parts, number of expanded words)
        cases = [
            ('hi', 1, 1),
            ('B-{a,b}-E', 3, 2),
            ('B-{a,={b,c,d}=,e}-E', 3, 5),
            ('B-{a,b}-{c,d}-E', 5, 4),
        ]
        for code, num_parts, num_words in cases:
            tree = self._Detect(code)
            self.assertEqual(num_parts, len(tree.parts))
            pprint(tree)

            expanded = braces._BraceExpand(tree.parts)
            self.assertEqual(num_words, len(expanded))
            for word_parts in expanded:
                _ColorPrint(ast.CompoundWord(word_parts))
                print('')
if __name__ == "__main__":
    # Run every test in this module when it is executed as a script.
    unittest.main()
View
@@ -72,6 +72,10 @@ module osh
| TildeSubPart(string prefix)
| CommandSubPart(command command_list)
| ArithSubPart(arith_expr anode)
| AltPart(word* words) -- {a,b,c} is a single AltPart
| NumRangePart(int start, int end, int? step) -- {1..10} or {1..10..2}
| CharRangePart(string start, string end, int? step)
-- {a..f} or {a..f..2} or {a..f..-2}
word =
TokenWord(token token)
View
@@ -336,7 +336,8 @@ dparen() {
brace-expansion() {
# NOTE: being a korn shell, mksh has brace expansion. But dash doesn't!
sh-spec tests/brace-expansion.test.sh $BASH $MKSH "$@"
# numeric ranges come from ZSH? mksh doesn't have them.
sh-spec tests/brace-expansion.test.sh $BASH $MKSH $ZSH "$@"
}
regex() {
Oops, something went wrong.

0 comments on commit 43581de

Please sign in to comment.