Permalink
Browse files

Fix more ${//} cases by properly parsing the glob into an AST (#134)

- character classes now work (a case in spec/var-op-other.test.sh)
- hard cases like \**+ are now properly parsed and executed.

There are still a few IndexError crash bugs in the character class parser, however.
  • Loading branch information...
aykamko authored and andychu committed Jun 15, 2018
1 parent 2503f25 commit 479782ed504b0fb4401952cd0f8fc324cdc00193
Showing with 254 additions and 106 deletions.
  1. +2 −2 build/codegen.sh
  2. +8 −0 build/dev.sh
  3. +144 −51 core/glob_.py
  4. +68 −33 core/glob_test.py
  5. +4 −4 core/libstr.py
  6. +1 −1 osh/cmd_parse.py
  7. +7 −15 osh/glob.asdl
  8. +20 −0 osh/meta.py
View
@@ -32,8 +32,8 @@ source build/common.sh
# _devbuild/gen/
# osh-types.h - lex_mode_e for now
# id_kind.h - id_e for now
# osh-lex.re2c.c
# osh-lex.c
# osh-lex.re2c.c
# osh-lex.c
download-re2c() {
mkdir -p _deps
View
@@ -50,6 +50,13 @@ gen-runtime-asdl() {
echo "Wrote $out"
}
# Generate the Python module for the glob AST schema (osh/glob.asdl) and
# write it to _devbuild/gen/glob_asdl.py.
gen-glob-asdl() {
  local out=_devbuild/gen/glob_asdl.py
  # Import statement handed to the generator as its second argument.
  local import='from osh.meta import GLOB_TYPE_LOOKUP as TYPE_LOOKUP'
  # Quote the redirect target so the path is always treated as one word.
  PYTHONPATH=. asdl/gen_python.py osh/glob.asdl "$import" > "$out"
  echo "Wrote $out"
}
# TODO: should fastlex.c be part of the dev build? It means you need re2c
# installed? I don't think it makes sense to have 3 builds, so yes I think we
# can put it here for simplicity.
@@ -102,6 +109,7 @@ minimal() {
gen-types-asdl
gen-osh-asdl
gen-runtime-asdl
gen-glob-asdl
pylibc
}
View
@@ -8,9 +8,12 @@
except ImportError:
from benchmarks import fake_libc as libc
from osh.meta import glob as glob_ast
from core import util
log = util.log
glob_part_e = glob_ast.glob_part_e
def LooksLikeGlob(s):
"""
@@ -67,67 +70,157 @@ def GlobEscape(s):
# always use utf-8?
# - Character class for glob is different than char class for regex? According
# to the standard, anyway.
# - Honestly I would like a more principled parser for globs! Can re2c do
# better here?
def GlobToExtendedRegex(glob_pat):
"""Convert a glob to a libc extended regexp.
# - Honestly I would like a more principled parser for globs! Can re2c do
# better here?
Returns:
An ERE string, or None if the pattern is a constant string rather than
a glob.
class GlobParser(object):
"""
Parses glob patterns. Can convert directly to libc extended regexp or output
an intermediate AST (defined at osh/glob.asdl).
"""
is_glob = False
err = None
i = 0
n = len(glob_pat)
out = []
while i < n:
c = glob_pat[i]
if c == '\\': # glob escape like \* or \?
# BUG: This isn't correct because \* is escaping a glob character, but
# then it's also a regex metacharacter. We should really parse the glob
# into a symbolic form first, not do text->text conversion.
# Hard test case: \** as a glob -> \*.* as a regex.
def Parse(self, glob_pat):
  """Parses a glob pattern into AST form (defined at osh/glob.asdl).

  Args:
    glob_pat: The glob pattern string to parse.

  Returns:
    A 2-tuple of (<glob AST>, <error message>).
    If the pattern is not actually a glob, the first element is None.  The
    second element is None unless there was an error during parsing.
  """
  try:
    return self._ParseUnsafe(glob_pat)
  except RuntimeError as e:
    return None, str(e)
  except IndexError:
    # The parser indexes past the end of truncated patterns (e.g. a trailing
    # backslash or a '[' with nothing after it).  Report that as a parse
    # error, per this method's contract, instead of crashing the caller.
    return None, 'unexpected end of glob pattern %r' % glob_pat
def _ParseUnsafe(self, glob_pat):
"""
Parses a glob pattern into AST form.
Raises:
RuntimeError: if glob is invalid
"""
is_glob = False
i = 0
n = len(glob_pat)
parts = []
while i < n:
c = glob_pat[i]
if c == '\\': # glob escape like \* or \?
i += 1
parts.append(glob_ast.Literal(glob_pat[i]))
elif c == '*':
is_glob = True
parts.append(glob_ast.Star())
elif c == '?':
is_glob = True
parts.append(glob_ast.QMark())
elif c == '[':
is_glob = True
char_class_expr, i = self.ParseCharClassExpr(glob_pat, i)
parts.append(char_class_expr)
else:
parts.append(glob_ast.Literal(c))
i += 1
out.append(glob_pat[i])
elif c == '*':
is_glob = True
out.append('.*')
elif c == '?':
is_glob = True
out.append('.')
# TODO: Enter a different state and parse character classes
# NOTE: Is [!abc] negation rather than [^abc] ?
elif c == '[':
err = True # TODO: better error
break
elif c == ']':
err = True
# Escape a single character for extended regex literals.""
# https://www.gnu.org/software/findutils/manual/html_node/find_html/posix_002dextended-regular-expression-syntax.html
elif c in '.|^$()+': # Already handled \ * ? []
out.append('\\' + c)
else:
out.append(c)
if is_glob:
return glob_ast.glob(parts), None
return None, None
def ParseCharClassExpr(self, glob_pat, start_i):
"""Parses a character class expression, e.g. [abc], [[:space:]], [!a-z]
Returns:
A 2-tuple of (<CharClassExpr instance>, <next parse index>)
Raises:
RuntimeError: If error during parsing the character class.
"""
i = start_i
if glob_pat[i] != '[':
raise RuntimeError('invalid CharClassExpr start!')
i += 1
# NOTE: Both ! and ^ work for negation in globs
# https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html#Pattern-Matching
negated = glob_pat[i] in '!^'
if negated:
i += 1
in_posix_class = False
expr_body = []
n = len(glob_pat)
# NOTE: special case: ] is matched iff it's the first char in the expression
if glob_pat[i] == ']':
expr_body.append(']')
i += 1
while i < n:
c = glob_pat[i]
if c == ']':
if not in_posix_class:
break
in_posix_class = False
elif c == '[':
if in_posix_class:
raise RuntimeError('invalid character [ in CharClassExpr')
in_posix_class = (glob_pat[i+1] == ':')
elif c == '\\':
expr_body.append(c)
i += 1
c = glob_pat[i]
expr_body.append(c)
i += 1
if err:
return None, err
else:
if is_glob:
regex = ''.join(out)
else:
regex = None
return regex, err
raise RuntimeError('unclosed CharClassExpr!')
return glob_ast.CharClassExpr(negated, ''.join(expr_body)), i
def ASTToExtendedRegex(self, ast):
  """Converts a glob AST into a libc extended regular expression (ERE).

  Args:
    ast: A glob AST as returned by Parse(), or None.

  Returns:
    The ERE as a string, or None if `ast` is falsy (Parse() returns None in
    place of an AST for patterns that are not globs).
  """
  if not ast:
    return None

  # Characters that must be backslash-escaped to match literally in an ERE.
  ere_meta = '.|^$()+*?[]{}\\'

  out = []
  for part in ast.parts:
    if part.tag == glob_part_e.Literal:
      # Escape character by character.  A substring test of the whole
      # literal against the metacharacter list would also fire for an empty
      # literal and would mis-escape a multi-character one.
      for ch in part.s:
        if ch in ere_meta:
          out.append('\\')
        out.append(ch)
    elif part.tag == glob_part_e.Star:
      out.append('.*')
    elif part.tag == glob_part_e.QMark:
      out.append('.')
    elif part.tag == glob_part_e.CharClassExpr:
      # A negated class is always emitted with the ERE spelling [^...].
      out.append('[')
      if part.negated:
        out.append('^')
      out.append(part.body + ']')
  return ''.join(out)
def GlobToExtendedRegex(self, glob_pat):
  """Parses a glob pattern and converts the result to an extended regexp.

  Returns:
    A 2-tuple of (<result of ASTToExtendedRegex>, <error from Parse>).
  """
  parsed, parse_err = self.Parse(glob_pat)
  ere = self.ASTToExtendedRegex(parsed)
  return ere, parse_err
def _GlobUnescape(s): # used by cmd_exec
"""Remove glob escaping from a string.
Used when there is no glob match.
TODO: Can probably get rid of this, as long as you save the original word.
@@ -196,10 +289,10 @@ def Expand(self, arg):
if g:
return g
else: # Nothing matched
if self.exec_opts.failglob:
if self.exec_opts.failglob:
# TODO: Make the command return status 1.
raise NotImplementedError
if self.exec_opts.nullglob:
if self.exec_opts.nullglob:
return []
else:
# Return the original string
View
@@ -7,8 +7,9 @@
import re
import unittest
import libc
from asdl import py_meta
from core import glob_
from osh.meta import glob as g
class GlobEscapeTest(unittest.TestCase):
@@ -92,42 +93,76 @@ def testPatSubRegexes(self):
result = r2.sub('X', 'a-b-c', count=1)
self.assertEqual('X-b-c', result)
def testGlobToExtendedRegex(self):
def assertASTEqual(self, expected_ast, ast):
  """Asserts that 2 ASDL-defined ASTs are equal.

  Values that are not py_meta.CompoundObj instances are compared with
  assertEqual.  Nodes are compared by tag and then slot by slot, recursing
  element-wise into list-valued slots.

  Args:
    expected_ast: The expected node or plain value.
    ast: The actual node or plain value.
  """
  expected_is_node = isinstance(expected_ast, py_meta.CompoundObj)
  given_is_node = isinstance(ast, py_meta.CompoundObj)

  if not expected_is_node and not given_is_node:
    self.assertEqual(expected_ast, ast)
    return

  # A node never equals a non-node (e.g. None).  Fail with a readable
  # message rather than an AttributeError on the .tag access below.
  if expected_is_node != given_is_node:
    self.fail('expected %r, got %r' % (expected_ast, ast))

  self.assertEqual(expected_ast.tag, ast.tag)
  if not hasattr(expected_ast, '__slots__'):
    return

  self.assertEqual(expected_ast.__slots__, ast.__slots__)
  for attr in expected_ast.__slots__:
    exp_slot, slot = getattr(expected_ast, attr), getattr(ast, attr)
    if isinstance(exp_slot, list) or isinstance(slot, list):
      self.assertTrue(
          isinstance(exp_slot, list) and isinstance(slot, list),
          '%s: expected %r, got %r' % (attr, exp_slot, slot))
      # zip() stops at the shorter list, so check the lengths explicitly;
      # otherwise missing or extra elements would go unnoticed.
      self.assertEqual(
          len(exp_slot), len(slot),
          '%s: expected %d elements, got %d' % (
              attr, len(exp_slot), len(slot)))
      for exp_elem, elem in zip(exp_slot, slot):
        self.assertASTEqual(exp_elem, elem)
    else:
      self.assertASTEqual(exp_slot, slot)
def testGlobParser(self):
CASES = [
# glob input, (regex, err)
('*.py', '.*\.py', None),
('*.?', '.*\..', None),
('<*>', '<.*>', None),
#('\\*', '\\*', None), # not a glob, a string
# Hard case: a literal * and then a glob
#('\\**', '\\**', None),
#('c:\\foo', 'c:\\\\foo', None),
('abc', None, None), # not a glob
# TODO: These should be parsed
('[[:space:]]', None, True),
('[abc]', None, True),
('[abc\[]', None, True),
# (glob input, expected AST, expected extended regexp, has error)
('*.py', [g.Star()] + [g.Literal(c) for c in '.py'], '.*\.py', False),
('*.?', [g.Star(), g.Literal('.'), g.QMark()], '.*\..', False),
('<*>', [g.Literal('<'), g.Star(), g.Literal('>')], '<.*>', False),
('\**+', [g.Literal('*'), g.Star(), g.Literal('+')], '\*.*\+', False),
('\**', [g.Literal('*'), g.Star()], '\*.*', False),
# not globs
('abc', None, None, False),
('\\*', None, None, False),
('c:\\foo', None, None, False),
('strange]one', None, None, False),
# character class globs
('[[:space:]abc]', [g.CharClassExpr(False, '[:space:]abc')], '[[:space:]abc]', False),
('[abc]', [g.CharClassExpr(False, 'abc')], '[abc]', False),
('[\a\b\c]', [g.CharClassExpr(False, '\a\b\c')], '[\a\b\c]', False),
('[abc\[]', [g.CharClassExpr(False, 'abc\[')], '[abc\[]', False),
('[!not]', [g.CharClassExpr(True, 'not')], '[^not]', False),
('[^also_not]', [g.CharClassExpr(True, 'also_not')], '[^also_not]', False),
('[]closed_bracket]', [g.CharClassExpr(False, ']closed_bracket')], '[]closed_bracket]', False),
('[!]closed_bracket]', [g.CharClassExpr(True, ']closed_bracket')], '[^]closed_bracket]', False),
('[!*?!\\[]', [g.CharClassExpr(True, '*?!\[')], '[^*?!\\[]', False),
('[!\]foo]', [g.CharClassExpr(True, '\]foo')], '[^\]foo]', False),
('wow[[[[]]]]', ([g.Literal(c) for c in 'wow'] +
[g.CharClassExpr(False, '[[[')] +
[g.Literal(c) for c in ']]']), 'wow[[[[]\]\]\]', False),
# invalid globs
('not_closed[a-z', None, None, True),
('[[:spa[ce:]]', None, None, True),
]
for glob, expected_regex, expected_err in CASES:
regex, err = glob_.GlobToExtendedRegex(glob)
self.assertEqual(expected_regex, regex,
'%s: expected %r, got %r' % (glob, expected_regex, regex))
self.assertEqual(expected_err, err,
for glob, expected_parts, expected_ere, expected_err in CASES:
if expected_parts:
expected_ast = g.glob(expected_parts)
else:
expected_ast = None
parser = glob_.GlobParser()
ast, err = parser.Parse(glob)
ere = parser.ASTToExtendedRegex(ast)
self.assertASTEqual(expected_ast, ast)
self.assertEqual(expected_ere, ere)
self.assertEqual(expected_err, err is not None,
'%s: expected %r, got %r' % (glob, expected_err, err))
def testPatSubRegexesLibc(self):
  """Exercises libc.regex_parse and libc.regex_match, printing the results."""
  pattern = '^(.*)git.*(.*)'

  parsed = libc.regex_parse(pattern)
  print(parsed)

  # It matches. But we need to get the positions out!
  matched = libc.regex_match(pattern, '~/git/oil')
  print(matched)

  # Or should we make a match in a loop?
  # We have to keep advancing the string until there are no more matches.
if __name__ == '__main__':
unittest.main()
Oops, something went wrong.

0 comments on commit 479782e

Please sign in to comment.