From 8066fd72df83396d9301c2455575538ebd936ad2 Mon Sep 17 00:00:00 2001 From: Andy Chu Date: Mon, 14 Aug 2017 01:46:29 -0700 Subject: [PATCH] Implement Patsub ${x/pat/replace} and strip ops ${x#prefix}, etc. There are two strategies, depending on the pattern. 1) Fixed strings use Python's string methods, e.g. startswith/endswith/replace/slice. 2) Glob patterns are converted to Python regexes. (Character classes aren't currently supported.) Then we use the regex engine for position information and greedy/non-greedy matches. Also: - Added tests. - Fix parsing. - TODO: Unicode Addresses issue #26. --- core/glob_.py | 90 +++++++++++------ core/glob_test.py | 34 +++++-- core/word_eval.py | 197 ++++++++++++++++++++++---------------- osh/word_parse.py | 30 +++--- osh/word_parse_test.py | 4 +- spec/var-op-other.test.sh | 15 +++ test/spec.sh | 4 +- 7 files changed, 241 insertions(+), 133 deletions(-) diff --git a/core/glob_.py b/core/glob_.py index 42aab8c2da..40d3b50a09 100644 --- a/core/glob_.py +++ b/core/glob_.py @@ -3,6 +3,8 @@ glob_.py """ +import re + import libc from core.util import log @@ -55,56 +57,88 @@ def GlobEscape(s): return escaped -def GlobToExtendedRegex(g): - """Convert a glob to a libc extended regexp. +# We need to handle glob patterns, but fnmatch doesn't give you the positions +# of matches. So we convert globs to regexps. - For ${s//pat*/__}. +# There are two regex engines we can use. Each has advantages and +# disadvantages: - We need to use regcomp/regex because fnmatch doesn't give you the positions - of matches. +# Python regex: +# - Supports Greedy vs. Non-greedy (necessary for strip ops, but not patsub) +# - Doesn't rely on global variables for unicode. I think libc string +# functions use LOCALE? + +# ERE: +# - Linear time algorithm +# - Save code space +# - Supports the same character classes as glob. - Why not use Python? To avoid backtracking? I think we should just Python - here. Because we want Unicode to be consistent too. - - What other string ops are there? +def GlobToExtendedRegex(g): + """Convert a glob to a libc extended regexp. Returns: A ERE string, or None if it's the pattern is a constant string rather than a glob. """ - # NOTE: character classes are retained literally, since EREs have the same - # char class syntax? + # Could be used for ${s//pat*/__}, but NOT # ## % %%. + # We'll use Python everywhere for simplicity. + raise NotImplementedError -def GlobToPythonRegex(g, longest=True): +def GlobToPythonRegex(s, greedy=True): """Convert a glob to a libc extended regexp. Args: - longest: whether * should be '.*' (greedy) or '.*?' (non-greedy) - - We need Python's engine for greedy and non-greedy matches. libc doesn't have - that. - - For string ops like ${s#'*b'} + greedy: whether * should be '.*' (greedy) or '.*?' (non-greedy) NOTE: character classes aren't supported. Returns: A Python regex string, or None if it's the pattern is a constant string rather than a glob. + + regex, err? """ - return None - # TODO: - # - Iterate through each characater - # - Check for escapes - # - If it - - if longest: - pass + star_pat = '.*' if greedy else '.*?' + + is_glob = False + err = None + + i = 0 + n = len(s) + out = [] + while i < n: + c = s[i] + if c == '\\': # glob escape like \* or \? + i += 1 + out.append(s[i]) + elif c == '*': + is_glob = True + out.append(star_pat) + elif c == '?': + is_glob = True + out.append('.') + # TODO: Should we enter a different state and parse these? + elif c == '[': + err = True # TODO: better error + break + elif c == ']': + err = True + else: + # e.g. . -> \. + out.append(re.escape(c)) + + i += 1 + + if err: + return None, err else: - pass - return '^' + '$' + if is_glob: + regex = ''.join(out) + else: + regex = None + return regex, err def _GlobUnescape(s): # used by cmd_exec diff --git a/core/glob_test.py b/core/glob_test.py index 4e207a6b31..53fcbf6295 100755 --- a/core/glob_test.py +++ b/core/glob_test.py @@ -77,12 +77,35 @@ def testPatSubRegexes(self): # x=~/git/oil # ${x//git*/X/} - # NOTE: This should be regcomp - r = re.compile('(^.*)git.*(.*)') - - result = r.sub(r'\1' + 'X' + r'\2', '~/git/oil') + # git* + r1 = re.compile('git.*') + result = r1.sub('X', '~/git/oil') self.assertEqual('~/X', result) + r2 = re.compile('[a-z]') + result = r2.sub('X', 'a-b-c') + self.assertEqual('X-X-X', result) + + # Substitute the first one only + r2 = re.compile('[a-z]') + result = r2.sub('X', 'a-b-c', count=1) + self.assertEqual('X-b-c', result) + + def testGlobToPythonRegex(self): + CASES = [ + # glob input, (regex, err) + ('*.py', '.*\.py', None), + ('*.?', '.*\..', None), + ('abc', None, None), + ('[[:space:]]', None, True), + ] + for glob, expected_regex, expected_err in CASES: + regex, err = glob_.GlobToPythonRegex(glob) + self.assertEqual(expected_regex, regex, + '%s: expected %r, got %r' % (glob, expected_regex, regex)) + self.assertEqual(expected_err, err, + '%s: expected %r, got %r' % (glob, expected_err, err)) + def testPatSubRegexesLibc(self): r = libc.regex_parse('^(.*)git.*(.*)') print(r) @@ -94,8 +117,5 @@ def testPatSubRegexesLibc(self): # We have to keep advancing the string until there are no more matches. - - - if __name__ == '__main__': unittest.main() diff --git a/core/word_eval.py b/core/word_eval.py index 89ea848a22..976c405028 100644 --- a/core/word_eval.py +++ b/core/word_eval.py @@ -268,6 +268,115 @@ def _DecayPartValuesToString(part_vals, join_char): return ''.join(out) +# TODO: +# - Unicode support: Convert both pattern, string, and replacement to unicode, +# then the result back at the end. +# - Add location info to errors. Maybe pass spid pair all the way down. +# - Compile time errors for [[:space:]] ? + +def _DoUnarySuffixOp(s, op, arg): + """Helper for ${x#prefix} and family.""" + + pat_re, err = glob_.GlobToPythonRegex(arg) + if err: + e_die("Can't convert glob to regex: %r", arg) + + if pat_re is None: # simple/fast path for fixed strings + if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix + if s.startswith(arg): + return s[len(arg):] + else: + return s + + elif op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix + if s.endswith(arg): + # Mutate it so we preserve the flags. + return s[:-len(arg)] + else: + return s + + else: # e.g. ^ ^^ , ,, + raise AssertionError(op.op_id) + + else: # glob pattern + # Extract the group from the regex and return it + if op.op_id == Id.VOp1_Pound: # shortest prefix + # Need non-greedy match + pat_re2, err = glob_.GlobToPythonRegex(arg, greedy=False) + r = re.compile(pat_re2) + m = r.match(s) + if m: + return s[m.end(0):] + else: + return s + + elif op.op_id == Id.VOp1_DPound: # longest prefix + r = re.compile(pat_re) + m = r.match(s) + if m: + return s[m.end(0):] + else: + return s + + elif op.op_id == Id.VOp1_Percent: # shortest suffix + # NOTE: This is different than re.search, which will find the longest suffix. + r = re.compile('^(.*)' + pat_re + '$') + m = r.match(s) + if m: + return m.group(1) + else: + return s + + elif op.op_id == Id.VOp1_DPercent: # longest suffix + r = re.compile('^(.*?)' + pat_re + '$') # non-greedy + m = r.match(s) + if m: + return m.group(1) + else: + return s + + else: + raise AssertionError(op.op_id) + + +def _PatSub(s, op, pat, replace_str): + """Helper for ${x/pat/replace}.""" + #log('PAT %r REPLACE %r', pat, replace_str) + py_regex, err = glob_.GlobToPythonRegex(pat) + if err: + e_die("Can't convert glob to regex: %r", pat) + + if py_regex is None: # Simple/fast path for fixed strings + if op.do_all: + return s.replace(pat, replace_str) + elif op.do_prefix: + if s.startswith(pat): + n = len(pat) + return replace_str + s[n:] + else: + return s + elif op.do_suffix: + if s.endswith(pat): + n = len(pat) + return s[:-n] + replace_str + else: + return s + else: + return s.replace(pat, replace_str, 1) # just the first one + + else: + count = 1 # replace first occurrence only + if op.do_all: + count = 0 # replace all + elif op.do_prefix: + py_regex = '^' + py_regex + elif op.do_suffix: + py_regex = py_regex + '$' + + pat_re = re.compile(py_regex) + return pat_re.sub(replace_str, s, count) + + # Eval is for ${a-} and ${a+}, Error is for ${a?}, and Assign is for ${a=} Effect = util.Enum('Effect', 'SpliceParts Error SpliceAndAssign NoOp'.split()) @@ -449,104 +558,30 @@ def _ApplyPrefixOp(self, val, op_id): else: raise AssertionError(op_id) - # TODO: Make a free function - def _DoUnarySuffixOp(self, s, op, arg): - """Helper for ${x#prefix} and family.""" - - # op_id? - py_regex = glob_.GlobToPythonRegex(arg) - if py_regex is not None: - # Extract the group from the regex and return it - - if op.op_id == Id.VOp1_Pound: # shortest prefix - raise NotImplementedError - elif op.op_id == Id.VOp1_DPound: # longest prefix - raise NotImplementedError - - elif op.op_id == Id.VOp1_Percent: # shortest suffix - raise NotImplementedError - elif op.op_id == Id.VOp1_DPercent: # longest suffix - raise NotImplementedError - else: - raise AssertionError(op.op_id) - - else: # fixed string - if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix - if s.startswith(arg): - return s[len(arg):] - else: - return s - - elif op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix - if s.endswith(arg): - # Mutate it so we preserve the flags. - return s[:-len(arg)] - else: - return s - - else: - raise AssertionError(op.op_id) - - # TODO: Make a free function - def _PatSub(self, s, op, pat, replace_str): - """Helper for ${x/pat/replace}.""" - #log('PAT %r REPLACE %r', pat, replace_str) - ere = glob_.GlobToExtendedRegex(pat) - # This can't fail? ere must be valid? - return libc.regex_replace(ere, replace_str, s, op.do_all) - def _ApplyUnarySuffixOp(self, val, op): - # NOTES: - # - These are VECTORIZED on arrays - # - I want to allow this with my shell dialect: @{files|slice 1 - # 2|upper} does the same thing to all of them. - # - How to do longest and shortest possible match? bash and mksh both - # call fnmatch() in a loop, with possible short-circuit optimizations. - # - TODO: Write a test program to show quadratic behavior? - # - original AT&T ksh has special glob routines that returned the match - # positions. - # Implementation: - # - Test if it is a LITERAL or a Glob. Then do a plain string - # operation. - # - If it's a glob, do the quadratic algorithm. - # - NOTE: bash also has an optimization where it extracts the LITERAL - # parts of the string, and does a prematch. If none of them match, - # then it SKIPs the quadratic algorithm. - # - The real solution is to compile a glob to RE2, but I want to avoid - # that dependency right now... libc regex is good for a bunch of - # things. - # - Bash has WIDE CHAR support for this. With wchar_t. - # - All sorts of functions like xdupmbstowcs - # - # And then pat_subst() does some special cases. Geez. - assert val.tag != value_e.Undef op_kind = LookupKind(op.op_id) - new_val = None - # TODO: Vectorization should be factored out of all the branches. if op_kind == Kind.VOp1: #log('%s', op) arg_val = self.word_ev.EvalWordToString(op.arg_word, do_fnmatch=True) assert arg_val.tag == value_e.Str if val.tag == value_e.Str: - s = self._DoUnarySuffixOp(val.s, op, arg_val.s) + s = _DoUnarySuffixOp(val.s, op, arg_val.s) new_val = runtime.Str(s) else: # val.tag == value_e.StrArray: + # ${a[@]#prefix} is VECTORIZED on arrays. Oil should have this too. strs = [] for s in val.strs: - strs.append(self._DoUnarySuffixOp(s, op, arg_val.s)) + strs.append(_DoUnarySuffixOp(s, op, arg_val.s)) new_val = runtime.StrArray(strs) else: raise AssertionError(op_kind) - if new_val: - return new_val - else: - return val + return new_val def _EvalDoubleQuotedPart(self, part): # Example of returning array: @@ -770,16 +805,16 @@ def _EvalBracedVarSub(self, part, quoted): pat = pat_val.s if val.tag == value_e.Str: - s = self._PatSub(val.s, op, pat, replace_str) + s = _PatSub(val.s, op, pat, replace_str) val = runtime.Str(s) elif val.tag == value_e.StrArray: strs = [] for s in val.strs: - strs.append(self._PatSub(s, op, pat, replace_str)) + strs.append(_PatSub(s, op, pat, replace_str)) val = runtime.StrArray(strs) else: - raise AssertionError(val.tag) + raise AssertionError(val.__class__.__name__) elif op.tag == suffix_op_e.Slice: # Either string slicing or array slicing. However string slicing has diff --git a/osh/word_parse.py b/osh/word_parse.py index bc8827365f..e427705e9f 100644 --- a/osh/word_parse.py +++ b/osh/word_parse.py @@ -197,22 +197,26 @@ def _ReadPatSubVarOp(self, lex_mode): p = ast.LiteralPart(self.cur_token) pat.parts.append(p) - # Check for other modifiers - first_part = pat.parts[0] - if first_part.tag == word_part_e.LiteralPart: - lit_id = first_part.token.id - if lit_id == Id.Lit_Slash: - do_all = True - pat.parts.pop(0) - elif lit_id == Id.Lit_Percent: - do_prefix = True - pat.parts.pop(0) - elif lit_id == Id.Lit_Pound: - do_suffix = True - pat.parts.pop(0) + if len(pat.parts) == 0: + self._BadToken("Pattern must not be empty: %r", token=self.cur_token) + return None + else: + first_part = pat.parts[0] + if first_part.tag == word_part_e.LiteralPart: + lit_id = first_part.token.id + if lit_id == Id.Lit_Slash: + do_all = True + pat.parts.pop(0) + elif lit_id == Id.Lit_Pound: + do_prefix = True + pat.parts.pop(0) + elif lit_id == Id.Lit_Percent: + do_suffix = True + pat.parts.pop(0) #self._Peek() if self.token_type == Id.Right_VarSub: + # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string return ast.PatSub(pat, None, do_all, do_prefix, do_suffix) elif self.token_type == Id.Lit_Slash: diff --git a/osh/word_parse_test.py b/osh/word_parse_test.py index f137536a5a..b22eda8101 100755 --- a/osh/word_parse_test.py +++ b/osh/word_parse_test.py @@ -198,13 +198,13 @@ def testPatSub(self): w = _assertReadWord(self, '${var/%pat/replace}') # prefix op = _GetSuffixOp(self, w) - self.assertTrue(op.do_prefix) + self.assertTrue(op.do_suffix) self.assertUnquoted('pat', op.pat) self.assertUnquoted('replace', op.replace) w = _assertReadWord(self, '${var/#pat/replace}') # suffix op = _GetSuffixOp(self, w) - self.assertTrue(op.do_suffix) + self.assertTrue(op.do_prefix) self.assertUnquoted('pat', op.pat) self.assertUnquoted('replace', op.replace) diff --git a/spec/var-op-other.test.sh b/spec/var-op-other.test.sh index 86a5afa575..0717f1a3df 100644 --- a/spec/var-op-other.test.sh +++ b/spec/var-op-other.test.sh @@ -53,6 +53,13 @@ echo ${s/?xx/_yy} ${s/%?xx/_yy} # N-I dash status: 2 # N-I dash stdout-json: "" +### Replace fixed strings +s=xx_xx +echo ${s/xx/yy} ${s//xx/yy} ${s/#xx/yy} ${s/%xx/yy} +# stdout: yy_xx yy_yy yy_xx xx_yy +# N-I dash status: 2 +# N-I dash stdout-json: "" + ### Replace is longest match # If it were shortest, then you would just replace the first s='begin end' @@ -91,6 +98,14 @@ echo status=$? # BUG bash/mksh status: 0 # BUG bash/mksh stdout-json: "-a/b/c-\nstatus=0\n" +### ${v/a} is the same as ${v/a/} -- no replacement string +v='aabb' +echo ${v/a} +echo status=$? +# stdout-json: "abb\nstatus=0\n" +# N-I dash stdout-json: "" +# N-I dash status: 2 + ### String slice foo=abcdefg echo ${foo:1:3} diff --git a/test/spec.sh b/test/spec.sh index 2aadea8722..2bea6cdf31 100755 --- a/test/spec.sh +++ b/test/spec.sh @@ -349,12 +349,12 @@ var-op-test() { } var-op-other() { - sh-spec spec/var-op-other.test.sh --osh-failures-allowed 10 \ + sh-spec spec/var-op-other.test.sh --osh-failures-allowed 5 \ ${REF_SHELLS[@]} $OSH "$@" } var-op-strip() { - sh-spec spec/var-op-strip.test.sh --osh-failures-allowed 5 \ + sh-spec spec/var-op-strip.test.sh --osh-failures-allowed 1 \ ${REF_SHELLS[@]} $ZSH $OSH "$@" }