Permalink
Browse files

Fix bugs with errexit. Found by gold.sh tests.

Unrelated:
- Testing and planning out string operations
  - Skeleton for PatSub, vectorize it
  - Implement vectorization of StringUnary ops
  - Add a test for longest match behavior of PatSub
  - Other tests
- Planning out regex_replace API (might go away)
  • Loading branch information...
Andy Chu
Andy Chu committed Aug 13, 2017
1 parent b70a401 commit 118089e788fb794650125ab0abddbba01fe8bdd7
Showing with 224 additions and 90 deletions.
  1. +52 −0 core/glob_.py
  2. +44 −0 core/glob_test.py
  3. +88 −84 core/word_eval.py
  4. +29 −3 native/libc_test.py
  5. +8 −0 spec/var-op-other.test.sh
  6. +2 −2 spec/var-op-strip.test.sh
  7. +1 −1 test/spec.sh
View
@@ -55,6 +55,58 @@ def GlobEscape(s):
return escaped
def GlobToExtendedRegex(g):
  """Convert a glob to a libc extended regexp (ERE).

  Meant for pattern substitution like ${s//pat*/__}.

  We need to use regcomp/regex because fnmatch doesn't give you the positions
  of matches.

  Open questions:
  - Why not use Python?  To avoid backtracking?  I think we should just use
    Python here, because we want Unicode to be consistent too.
  - What other string ops are there?

  Args:
    g: the glob pattern string.

  Returns:
    An ERE string, or None if the pattern is a constant string rather than a
    glob.
  """
  # NOTE: character classes are retained literally, since EREs have the same
  # char class syntax?

  # Stub: no translation is done yet, so every pattern yields None.
  return None
def GlobToPythonRegex(g, longest=True):
  """Convert a glob to a Python regex.

  For string ops like ${s#'*b'}.

  We need Python's engine for greedy and non-greedy matches.  libc doesn't have
  that.

  NOTE: character classes aren't supported.

  Args:
    g: the glob pattern string.
    longest: whether * should be '.*' (greedy) or '.*?' (non-greedy)

  Returns:
    A Python regex string, or None if the pattern is a constant string rather
    than a glob.  The translation is not implemented yet, so this currently
    returns None for every input.
  """
  # TODO: Implement the translation.  Sketch:
  # - Iterate through each character
  # - Check for escapes
  # - Translate * to '.*' if longest, else to '.*?'
  # - Anchor the result as '^' + <translated pattern> + '$'
  return None
def _GlobUnescape(s): # used by cmd_exec
"""Remove glob escaping from a string.
View
@@ -3,8 +3,10 @@
glob_test.py: Tests for glob.py
"""
import re
import unittest
import libc
from core import glob_
@@ -52,6 +54,48 @@ def testLooksLikeGlob(self):
self.assertEqual(expected, glob_.LooksLikeGlob(pat),
'%s: expected %r' % (pat, expected))
def testGlobStripRegexes(self):
s = 'aabbccdd'
# ${v%c*} # shortest suffix
m = re.match('^(.*)c.*$', s)
self.assertEqual('aabbc', m.group(1))
# ${v%%c*} # longest suffix
m = re.match('^(.*?)c.*$', s)
self.assertEqual('aabb', m.group(1))
# ${v#*b} # shortest prefix
m = re.match('^.*?b(.*)$', s)
self.assertEqual('bccdd', m.group(1))
# ${v##*b} # longest prefix
m = re.match('^.*b(.*)$', s)
self.assertEqual('ccdd', m.group(1))
def testPatSubRegexes(self):
# x=~/git/oil
# ${x//git*/X/}
# NOTE: This should be regcomp
r = re.compile('(^.*)git.*(.*)')
result = r.sub(r'\1' + 'X' + r'\2', '~/git/oil')
self.assertEqual('~/X', result)
def testPatSubRegexesLibc(self):
  # Exploratory: prints what the libc bindings return for a PatSub-style
  # regex.  There are no assertions yet.
  r = libc.regex_parse('^(.*)git.*(.*)')
  print(r)
  # It matches.  But we need to get the positions out!
  # (Parenthesized single-argument print works as both the Python 2 statement
  # and the Python 3 function, matching the call above.)
  print(libc.regex_match('^(.*)git.*(.*)', '~/git/oil'))
  # Or should we make a match in a loop?
  # We have to keep advancing the string until there are no more matches.
if __name__ == '__main__':
unittest.main()
View
@@ -6,6 +6,8 @@
import re
import sys
import libc
from core import braces
from core import expr_eval
from core import glob_
@@ -437,7 +439,8 @@ def _ApplyPrefixOp(self, val, op_id):
elif op_id == Id.VSub_Bang:
# NOTES:
# - Could translate to eval('$' + name) or eval("\$$name")
# - ${!array[@]} means something completely different. TODO: implement that.
# - ${!array[@]} means something completely different. TODO: implement
# that.
# - It might make sense to suggest implementing this with associative
# arrays?
@@ -446,8 +449,53 @@ def _ApplyPrefixOp(self, val, op_id):
else:
raise AssertionError(op_id)
def _ApplyOtherSuffixOp(self, val, op):
# TODO: Make a free function
def _DoUnarySuffixOp(self, s, op, arg):
  """Helper for ${x#prefix} and family.

  Args:
    s: the string to strip.
    op: the suffix operation; only op.op_id is read here.
    arg: the already-evaluated prefix/suffix pattern, as a string.

  Returns:
    s with the constant prefix or suffix removed, or s unchanged when it
    doesn't start/end with arg.

  Raises:
    NotImplementedError: if arg is a glob, i.e. glob_.GlobToPythonRegex()
      returned a regex.  Glob stripping isn't implemented yet.
    AssertionError: if op.op_id is not one of the four strip operators.
  """
  # op_id?
  py_regex = glob_.GlobToPythonRegex(arg)
  if py_regex is not None:
    # TODO: Extract the group from the regex and return it
    if op.op_id == Id.VOp1_Pound:  # shortest prefix
      raise NotImplementedError
    elif op.op_id == Id.VOp1_DPound:  # longest prefix
      raise NotImplementedError
    elif op.op_id == Id.VOp1_Percent:  # shortest suffix
      raise NotImplementedError
    elif op.op_id == Id.VOp1_DPercent:  # longest suffix
      raise NotImplementedError
    else:
      raise AssertionError(op.op_id)

  # Fixed string: shortest and longest match are the same thing.
  if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
    if s.startswith(arg):
      return s[len(arg):]
    return s

  if op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
    if s.endswith(arg):
      # Compute the end index from the front.  s[:-len(arg)] would be s[:0]
      # == '' for an empty arg, and endswith('') is always true, so an empty
      # suffix would erase the whole string.
      return s[:len(s) - len(arg)]
    return s

  raise AssertionError(op.op_id)
# TODO: Make a free function
def _PatSub(self, s, op, pat, replace_str):
  """Helper for ${x/pat/replace}.

  Args:
    s: the string to operate on.
    op: the PatSub operation; only op.do_all is read here (true means replace
      every match rather than just the first).
    pat: the already-evaluated pattern, as a string.
    replace_str: the already-evaluated replacement, as a string.

  Returns:
    The string with the substitution applied.
  """
  #log('PAT %r REPLACE %r', pat, replace_str)
  ere = glob_.GlobToExtendedRegex(pat)
  if ere is None:
    # GlobToExtendedRegex() documents None as "pat is a constant string rather
    # than a glob", so do a plain substring replacement instead of handing
    # None to libc.
    if not pat:
      # str.replace('', x) would insert x between every character.
      # NOTE(review): assumes an empty pattern leaves s unchanged, as in bash
      # -- confirm.
      return s
    if op.do_all:
      return s.replace(pat, replace_str)
    return s.replace(pat, replace_str, 1)

  # This can't fail?  ere must be valid?
  return libc.regex_replace(ere, replace_str, s, op.do_all)
def _ApplyUnarySuffixOp(self, val, op):
# NOTES:
# - These are VECTORIZED on arrays
# - I want to allow this with my shell dialect: @{files|slice 1
@@ -475,94 +523,25 @@ def _ApplyOtherSuffixOp(self, val, op):
assert val.tag != value_e.Undef
op_kind = LookupKind(op.op_id)
new_val = None
# TODO: Vectorization should be factored out of all the branches.
if op_kind == Kind.VOp1:
#log('%s', op)
arg_val = self.word_ev.EvalWordToString(op.arg_word, do_fnmatch=True)
assert arg_val.tag == value_e.Str
looks_like_glob = False
if looks_like_glob:
if op.op_id == Id.VOp1_Pound: # shortest prefix
raise NotImplementedError
elif op.op_id == Id.VOp1_DPound: # longest prefix
raise NotImplementedError
elif op.op_id == Id.VOp1_Percent: # shortest suffix
raise NotImplementedError
elif op.op_id == Id.VOp1_DPercent: # longest suffix
raise NotImplementedError
else:
raise AssertionError(op.op_id)
else:
op_str = arg_val.s
# TODO: Factor these out into a common function?
if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix
prefix = op_str
if val.tag == value_e.Str:
if val.s.startswith(prefix):
# Mutate it so we preserve the flags.
new_val = runtime.Str(val.s[len(prefix):])
else:
#log("Str: %r doesn't end with %r", val.s, suffix)
pass
elif val.tag == value_e.StrArray:
new_val = runtime.StrArray()
for i, s in enumerate(val.strs):
if s.startswith(prefix):
# Mutate it so we preserve the flags.
new_s = s[len(prefix):]
#log('%s -> %s', s, s[:-len(suffix)])
else:
new_s = s
#log("Array: %r doesn't end with %r", s, suffix)
new_val.strs.append(new_s)
elif op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix
suffix = op_str
if val.tag == value_e.Str:
if val.s.endswith(suffix):
# Mutate it so we preserve the flags.
new_val = runtime.Str(val.s[:-len(suffix)])
else:
#log("Str: %r doesn't end with %r", val.s, suffix)
pass
elif val.tag == value_e.StrArray:
new_val = runtime.StrArray()
for i, s in enumerate(val.strs):
if s.endswith(suffix):
# Mutate it so we preserve the flags.
new_s = s[:-len(suffix)]
#log('%s -> %s', s, s[:-len(suffix)])
else:
new_s = s
#log("Array: %r doesn't end with %r", s, suffix)
new_val.strs.append(new_s)
else:
raise AssertionError(op.op_id)
elif op_kind == Kind.VOp2:
if op.op_id == Id.VOp2_Slash: # PatSub, vectorized
raise NotImplementedError
# Either string slicing or array slicing. However string slicing has a
# unicode problem? TODO: Test bash out. We need utf-8 parsing in C++?
#
# Or maybe have a different operator for byte slice and char slice.
elif op.op_id == Id.VOp2_Colon:
raise NotImplementedError
if val.tag == value_e.Str:
s = self._DoUnarySuffixOp(val.s, op, arg_val.s)
new_val = runtime.Str(s)
else: # val.tag == value_e.StrArray:
strs = []
for s in val.strs:
strs.append(self._DoUnarySuffixOp(s, op, arg_val.s))
new_val = runtime.StrArray(strs)
else:
raise NotImplementedError(op)
raise AssertionError(op_kind)
if new_val:
return new_val
@@ -775,12 +754,37 @@ def _EvalBracedVarSub(self, part, quoted):
else:
val = self._EmptyStrOrError(val) # maybe error
# Other suffix: value -> value
val = self._ApplyOtherSuffixOp(val, part.suffix_op)
val = self._ApplyUnarySuffixOp(val, part.suffix_op)
elif op.tag == suffix_op_e.PatSub:
raise NotImplementedError(op)
elif op.tag == suffix_op_e.PatSub: # PatSub, vectorized
pat_val = self.word_ev.EvalWordToString(op.pat, do_fnmatch=True)
assert pat_val.tag == value_e.Str, pat_val
if op.replace:
replace_val = self.word_ev.EvalWordToString(op.replace,
do_fnmatch=True)
assert replace_val.tag == value_e.Str, replace_val
replace_str = replace_val.s
else:
replace_str = ''
pat = pat_val.s
if val.tag == value_e.Str:
s = self._PatSub(val.s, op, pat, replace_str)
val = runtime.Str(s)
elif val.tag == value_e.StrArray:
strs = []
for s in val.strs:
strs.append(self._PatSub(s, op, pat, replace_str))
val = runtime.StrArray(strs)
else:
raise AssertionError(val.tag)
elif op.tag == suffix_op_e.Slice:
# Either string slicing or array slicing. However string slicing has
# a unicode problem?
# Or maybe have a different operator for byte slice and char slice.
raise NotImplementedError(op)
# After applying suffixes, process decay_array here.
View
@@ -53,11 +53,37 @@ def testRegex(self):
print(libc.regex_parse(r'*'))
print(libc.regex_parse('\\'))
print(libc.regex_match(r'.*\.py', 'foo.py'))
print(libc.regex_match(r'.*\.py', 'abcd'))
# error
cases = [
(r'.*\.py', 'foo.py', True),
(r'.*\.py', 'abcd', False),
# The match is unanchored
(r'bc', 'abcd', True),
# The match is unanchored
(r'.c', 'abcd', True),
]
for pat, s, expected in cases:
actual = libc.regex_match(pat, s)
self.assertEqual(expected, actual)
# Error.
print(libc.regex_match(r'*', 'abcd'))
def testRegexReplace(self):
cases = [
(r'.\.py', 'X', 'foo.py', False, 'foX'),
(r'^\.py', 'X', 'foo.py', False, 'foo.py'), # Anchored left
(r'foo$', 'X', 'foo.py', False, 'foo.py'), # Anchored Right
(r'o', 'X', 'foo.py', False, 'fXo.py'), # replace all
(r'o', 'X', 'foo.py', True, 'fXX.py'), # replace all
]
return
for pat, replace, s, do_all, expected in cases:
actual = libc.regex_replace(pat, replace, s, do_all)
self.assertEqual(expected, actual)
if __name__ == '__main__':
unittest.main()
@@ -53,6 +53,14 @@ echo ${s/?xx/_yy} ${s/%?xx/_yy}
# N-I dash status: 2
# N-I dash stdout-json: ""
### Replace is longest match
# If it were shortest, then you would just replace the first <html>
s='begin <html></html> end'
echo ${s/<*>/[]}
# stdout: begin [] end
# N-I dash status: 2
# N-I dash stdout-json: ""
### Replace char class
s=xx_xx_xx
echo ${s//[[:alpha:]]/y} ${s//[^[:alpha:]]/-}
Oops, something went wrong.

0 comments on commit 118089e

Please sign in to comment.