From 8066fd72df83396d9301c2455575538ebd936ad2 Mon Sep 17 00:00:00 2001
From: Andy Chu <andy@lisa>
Date: Mon, 14 Aug 2017 01:46:29 -0700
Subject: [PATCH] Implement Patsub ${x/pat/replace} and strip ops ${x#prefix},
 etc.

There are two strategies, depending on the pattern.

1) Fixed strings use Python's string methods, e.g.
startswith/endswith/replace/slice.

2) Glob patterns are converted to Python regexes.  (Character classes
aren't currently supported.)

Then we use the regex engine for position information and
greedy/non-greedy matches.

Also:

- Added tests.
- Fix parsing.
- TODO: Unicode

Addresses issue #26.
---
 core/glob_.py             |  90 +++++++++++------
 core/glob_test.py         |  34 +++++--
 core/word_eval.py         | 197 ++++++++++++++++++++++----------------
 osh/word_parse.py         |  30 +++---
 osh/word_parse_test.py    |   4 +-
 spec/var-op-other.test.sh |  15 +++
 test/spec.sh              |   4 +-
 7 files changed, 241 insertions(+), 133 deletions(-)

diff --git a/core/glob_.py b/core/glob_.py
index 42aab8c2da..40d3b50a09 100644
--- a/core/glob_.py
+++ b/core/glob_.py
@@ -3,6 +3,8 @@
 glob_.py
 """
 
+import re
+
 import libc
 
 from core.util import log
@@ -55,56 +57,88 @@ def GlobEscape(s):
   return escaped
 
 
-def GlobToExtendedRegex(g):
-  """Convert a glob to a libc extended regexp.
+# We need to handle glob patterns, but fnmatch doesn't give you the positions
+# of matches.  So we convert globs to regexps.
 
-  For ${s//pat*/__}.
+# There are two regex engines we can use.  Each has advantages and
+# disadvantages:
 
-  We need to use regcomp/regex because fnmatch doesn't give you the positions
-  of matches.
+# Python regex:
+# - Supports Greedy vs. Non-greedy (necessary for strip ops, but not patsub)
+# - Doesn't rely on global variables for unicode.  I think libc string
+#   functions use LOCALE?
+
+# ERE:
+# - Linear time algorithm
+# - Save code space
+# - Supports the same character classes as glob.
 
-  Why not use Python?  To avoid backtracking?  I think we should just Python
-  here.  Because we want Unicode to be consistent too.
-  
-  What other string ops are there?
 
+def GlobToExtendedRegex(g):
+  """Convert a glob to a libc extended regexp.
 
   Returns:
     A ERE string, or None if it's the pattern is a constant string rather than
     a glob.
   """
-  # NOTE: character classes are retained literally, since EREs have the same
-  # char class syntax?
+  # Could be used for ${s//pat*/__}, but NOT # ## % %%.
+  # We'll use Python everywhere for simplicity.
+  raise NotImplementedError
 
 
-def GlobToPythonRegex(g, longest=True):
+def GlobToPythonRegex(s, greedy=True):
   """Convert a glob to a libc extended regexp.
 
   Args:
-    longest: whether * should be '.*' (greedy) or '.*?' (non-greedy)
-
-  We need Python's engine for greedy and non-greedy matches.  libc doesn't have
-  that.
-
-  For string ops like ${s#'*b'}
+    greedy: whether * should be '.*' (greedy) or '.*?' (non-greedy)
 
   NOTE: character classes aren't supported.
 
   Returns:
     A Python regex string, or None if it's the pattern is a constant string
     rather than a glob.
+
+    regex, err?
   """
-  return None
-  # TODO:
-  # - Iterate through each characater
-  # - Check for escapes
-  # - If it 
-
-  if longest:
-    pass
+  star_pat = '.*' if greedy else '.*?'
+
+  is_glob = False
+  err = None
+
+  i = 0
+  n = len(s)
+  out = []
+  while i < n:
+    c = s[i]
+    if c == '\\':  # glob escape like \* or \?
+      i += 1
+      out.append(s[i])
+    elif c == '*':
+      is_glob = True
+      out.append(star_pat)
+    elif c == '?':
+      is_glob = True
+      out.append('.')
+    # TODO: Should we enter a different state and parse these?
+    elif c == '[':
+      err = True  # TODO: better error
+      break
+    elif c == ']':
+      err = True
+    else:
+      # e.g. . -> \.
+      out.append(re.escape(c))
+
+    i += 1
+
+  if err:
+    return None, err
   else:
-    pass
-  return '^' + '$'
+    if is_glob:
+      regex = ''.join(out)
+    else:
+      regex = None
+    return regex, err
 
 
 def _GlobUnescape(s):  # used by cmd_exec
diff --git a/core/glob_test.py b/core/glob_test.py
index 4e207a6b31..53fcbf6295 100755
--- a/core/glob_test.py
+++ b/core/glob_test.py
@@ -77,12 +77,35 @@ def testPatSubRegexes(self):
     # x=~/git/oil
     # ${x//git*/X/}
 
-    # NOTE: This should be regcomp
-    r = re.compile('(^.*)git.*(.*)')
-
-    result = r.sub(r'\1' + 'X' + r'\2', '~/git/oil')
+    # git*
+    r1 = re.compile('git.*')
+    result = r1.sub('X', '~/git/oil')
     self.assertEqual('~/X', result)
 
+    r2 = re.compile('[a-z]')
+    result = r2.sub('X', 'a-b-c')
+    self.assertEqual('X-X-X', result)
+
+    # Substitute the first one only
+    r2 = re.compile('[a-z]')
+    result = r2.sub('X', 'a-b-c', count=1)
+    self.assertEqual('X-b-c', result)
+
+  def testGlobToPythonRegex(self):
+    CASES = [
+        # glob input, (regex, err)
+        ('*.py', '.*\.py', None),
+        ('*.?', '.*\..', None),
+        ('abc', None, None),
+        ('[[:space:]]', None, True),
+    ]
+    for glob, expected_regex, expected_err in CASES:
+      regex, err = glob_.GlobToPythonRegex(glob)
+      self.assertEqual(expected_regex, regex,
+          '%s: expected %r, got %r' % (glob, expected_regex, regex))
+      self.assertEqual(expected_err, err,
+          '%s: expected %r, got %r' % (glob, expected_err, err))
+
   def testPatSubRegexesLibc(self):
     r = libc.regex_parse('^(.*)git.*(.*)')
     print(r)
@@ -94,8 +117,5 @@ def testPatSubRegexesLibc(self):
     # We have to keep advancing the string until there are no more matches.
 
 
-
-
-
 if __name__ == '__main__':
   unittest.main()
diff --git a/core/word_eval.py b/core/word_eval.py
index 89ea848a22..976c405028 100644
--- a/core/word_eval.py
+++ b/core/word_eval.py
@@ -268,6 +268,115 @@ def _DecayPartValuesToString(part_vals, join_char):
   return ''.join(out)
 
 
+# TODO:
+# - Unicode support: Convert both pattern, string, and replacement to unicode,
+#   then the result back at the end.
+# - Add location info to errors.  Maybe pass spid pair all the way down.
+#   - Compile time errors for [[:space:]] ?
+
+def _DoUnarySuffixOp(s, op, arg):
+  """Helper for ${x#prefix} and family."""
+
+  pat_re, err = glob_.GlobToPythonRegex(arg)
+  if err:
+    e_die("Can't convert glob to regex: %r", arg)
+
+  if pat_re is None:  # simple/fast path for fixed strings
+    if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
+      if s.startswith(arg):
+        return s[len(arg):]
+      else:
+        return s
+
+    elif op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
+      if s.endswith(arg):
+        # Mutate it so we preserve the flags.
+        return s[:-len(arg)]
+      else:
+        return s
+
+    else:  # e.g. ^ ^^ , ,,
+      raise AssertionError(op.op_id)
+
+  else:  # glob pattern
+    # Extract the group from the regex and return it
+    if op.op_id == Id.VOp1_Pound:  # shortest prefix
+      # Need non-greedy match
+      pat_re2, err = glob_.GlobToPythonRegex(arg, greedy=False)
+      r = re.compile(pat_re2)
+      m = r.match(s)
+      if m:
+        return s[m.end(0):]
+      else:
+        return s
+
+    elif op.op_id == Id.VOp1_DPound:  # longest prefix
+      r = re.compile(pat_re)
+      m = r.match(s)
+      if m:
+        return s[m.end(0):]
+      else:
+        return s
+
+    elif op.op_id == Id.VOp1_Percent:  # shortest suffix
+      # NOTE: This is different than re.search, which will find the longest suffix.
+      r = re.compile('^(.*)' + pat_re + '$')
+      m = r.match(s)
+      if m:
+        return m.group(1)
+      else:
+        return s
+      
+    elif op.op_id == Id.VOp1_DPercent:  # longest suffix
+      r = re.compile('^(.*?)' + pat_re + '$')  # non-greedy
+      m = r.match(s)
+      if m:
+        return m.group(1)
+      else:
+        return s
+
+    else:
+      raise AssertionError(op.op_id)
+
+
+def _PatSub(s, op, pat, replace_str):
+  """Helper for ${x/pat/replace}."""
+  #log('PAT %r REPLACE %r', pat, replace_str)
+  py_regex, err = glob_.GlobToPythonRegex(pat)
+  if err:
+    e_die("Can't convert glob to regex: %r", pat)
+
+  if py_regex is None:  # Simple/fast path for fixed strings
+    if op.do_all:
+      return s.replace(pat, replace_str)
+    elif op.do_prefix:
+      if s.startswith(pat):
+        n = len(pat)
+        return replace_str + s[n:]
+      else:
+        return s
+    elif op.do_suffix:
+      if s.endswith(pat):
+        n = len(pat)
+        return s[:-n] + replace_str
+      else:
+        return s
+    else:
+      return s.replace(pat, replace_str, 1)  # just the first one
+
+  else:
+    count = 1  # replace first occurrence only
+    if op.do_all:
+      count = 0  # replace all
+    elif op.do_prefix:
+      py_regex = '^' + py_regex
+    elif op.do_suffix:
+      py_regex = py_regex + '$'
+
+    pat_re = re.compile(py_regex)
+    return pat_re.sub(replace_str, s, count)
+
+
 # Eval is for ${a-} and ${a+}, Error is for ${a?}, and Assign is for ${a=}
 
 Effect = util.Enum('Effect', 'SpliceParts Error SpliceAndAssign NoOp'.split())
@@ -449,104 +558,30 @@ def _ApplyPrefixOp(self, val, op_id):
     else:
       raise AssertionError(op_id)
 
-  # TODO: Make a free function
-  def _DoUnarySuffixOp(self, s, op, arg):
-    """Helper for ${x#prefix} and family."""
-
-    # op_id?
-    py_regex = glob_.GlobToPythonRegex(arg)
-    if py_regex is not None:
-      # Extract the group from the regex and return it
-
-      if op.op_id == Id.VOp1_Pound:  # shortest prefix
-        raise NotImplementedError
-      elif op.op_id == Id.VOp1_DPound:  # longest prefix
-        raise NotImplementedError
-
-      elif op.op_id == Id.VOp1_Percent:  # shortest suffix
-        raise NotImplementedError
-      elif op.op_id == Id.VOp1_DPercent:  # longest suffix
-        raise NotImplementedError
-      else:
-        raise AssertionError(op.op_id)
-
-    else:  # fixed string
-      if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound):  # const prefix
-        if s.startswith(arg):
-          return s[len(arg):]
-        else:
-          return s
-
-      elif op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent):  # const suffix
-        if s.endswith(arg):
-          # Mutate it so we preserve the flags.
-          return s[:-len(arg)]
-        else:
-          return s
-
-      else:
-        raise AssertionError(op.op_id)
-
-  # TODO: Make a free function
-  def _PatSub(self, s, op, pat, replace_str):
-    """Helper for ${x/pat/replace}."""
-    #log('PAT %r REPLACE %r', pat, replace_str)
-    ere = glob_.GlobToExtendedRegex(pat)
-    # This can't fail?  ere must be valid?
-    return libc.regex_replace(ere, replace_str, s, op.do_all)
-
   def _ApplyUnarySuffixOp(self, val, op):
-    # NOTES:
-    # - These are VECTORIZED on arrays
-    #   - I want to allow this with my shell dialect: @{files|slice 1
-    #   2|upper} does the same thing to all of them.
-    # - How to do longest and shortest possible match?  bash and mksh both
-    #   call fnmatch() in a loop, with possible short-circuit optimizations.
-    #   - TODO: Write a test program to show quadratic behavior?
-    #   - original AT&T ksh has special glob routines that returned the match
-    #   positions.
-    #   Implementation:
-    #   - Test if it is a LITERAL or a Glob.  Then do a plain string
-    #   operation.
-    #   - If it's a glob, do the quadratic algorithm.
-    #   - NOTE: bash also has an optimization where it extracts the LITERAL
-    #   parts of the string, and does a prematch.  If none of them match,
-    #   then it SKIPs the quadratic algorithm.
-    #   - The real solution is to compile a glob to RE2, but I want to avoid
-    #   that dependency right now... libc regex is good for a bunch of
-    #   things.
-    # - Bash has WIDE CHAR support for this.  With wchar_t.
-    #   - All sorts of functions like xdupmbstowcs
-    #
-    # And then pat_subst() does some special cases.  Geez.
-
     assert val.tag != value_e.Undef
 
     op_kind = LookupKind(op.op_id)
-    new_val = None
 
-    # TODO: Vectorization should be factored out of all the branches.
     if op_kind == Kind.VOp1:
       #log('%s', op)
       arg_val = self.word_ev.EvalWordToString(op.arg_word, do_fnmatch=True)
       assert arg_val.tag == value_e.Str
 
       if val.tag == value_e.Str:
-        s = self._DoUnarySuffixOp(val.s, op, arg_val.s)
+        s = _DoUnarySuffixOp(val.s, op, arg_val.s)
         new_val = runtime.Str(s)
       else:  # val.tag == value_e.StrArray:
+        # ${a[@]#prefix} is VECTORIZED on arrays.  Oil should have this too.
         strs = []
         for s in val.strs:
-          strs.append(self._DoUnarySuffixOp(s, op, arg_val.s))
+          strs.append(_DoUnarySuffixOp(s, op, arg_val.s))
         new_val = runtime.StrArray(strs)
 
     else:
       raise AssertionError(op_kind)
 
-    if new_val:
-      return new_val
-    else:
-      return val
+    return new_val
 
   def _EvalDoubleQuotedPart(self, part):
     # Example of returning array:
@@ -770,16 +805,16 @@ def _EvalBracedVarSub(self, part, quoted):
 
         pat = pat_val.s
         if val.tag == value_e.Str:
-          s = self._PatSub(val.s, op, pat, replace_str)
+          s = _PatSub(val.s, op, pat, replace_str)
           val = runtime.Str(s)
         elif val.tag == value_e.StrArray:
           strs = []
           for s in val.strs:
-            strs.append(self._PatSub(s, op, pat, replace_str))
+            strs.append(_PatSub(s, op, pat, replace_str))
           val = runtime.StrArray(strs)
 
         else:
-          raise AssertionError(val.tag)
+          raise AssertionError(val.__class__.__name__)
 
       elif op.tag == suffix_op_e.Slice:
         # Either string slicing or array slicing.  However string slicing has
diff --git a/osh/word_parse.py b/osh/word_parse.py
index bc8827365f..e427705e9f 100644
--- a/osh/word_parse.py
+++ b/osh/word_parse.py
@@ -197,22 +197,26 @@ def _ReadPatSubVarOp(self, lex_mode):
         p = ast.LiteralPart(self.cur_token)
         pat.parts.append(p)
 
-    # Check for other modifiers
-    first_part = pat.parts[0]
-    if first_part.tag == word_part_e.LiteralPart:
-      lit_id = first_part.token.id
-      if lit_id == Id.Lit_Slash:
-        do_all = True
-        pat.parts.pop(0)
-      elif lit_id == Id.Lit_Percent:
-        do_prefix = True
-        pat.parts.pop(0)
-      elif lit_id == Id.Lit_Pound:
-        do_suffix = True
-        pat.parts.pop(0)
+    if len(pat.parts) == 0:
+      self._BadToken("Pattern must not be empty: %r", token=self.cur_token)
+      return None
+    else:
+      first_part = pat.parts[0]
+      if first_part.tag == word_part_e.LiteralPart:
+        lit_id = first_part.token.id
+        if lit_id == Id.Lit_Slash:
+          do_all = True
+          pat.parts.pop(0)
+        elif lit_id == Id.Lit_Pound:
+          do_prefix = True
+          pat.parts.pop(0)
+        elif lit_id == Id.Lit_Percent:
+          do_suffix = True
+          pat.parts.pop(0)
 
     #self._Peek()
     if self.token_type == Id.Right_VarSub:
+      # e.g. ${v/a} is the same as ${v/a/}  -- empty replacement string
       return ast.PatSub(pat, None, do_all, do_prefix, do_suffix)
 
     elif self.token_type == Id.Lit_Slash:
diff --git a/osh/word_parse_test.py b/osh/word_parse_test.py
index f137536a5a..b22eda8101 100755
--- a/osh/word_parse_test.py
+++ b/osh/word_parse_test.py
@@ -198,13 +198,13 @@ def testPatSub(self):
 
     w = _assertReadWord(self, '${var/%pat/replace}')  # prefix
     op = _GetSuffixOp(self, w)
-    self.assertTrue(op.do_prefix)
+    self.assertTrue(op.do_suffix)
     self.assertUnquoted('pat', op.pat)
     self.assertUnquoted('replace', op.replace)
 
     w = _assertReadWord(self, '${var/#pat/replace}')  # suffix
     op = _GetSuffixOp(self, w)
-    self.assertTrue(op.do_suffix)
+    self.assertTrue(op.do_prefix)
     self.assertUnquoted('pat', op.pat)
     self.assertUnquoted('replace', op.replace)
 
diff --git a/spec/var-op-other.test.sh b/spec/var-op-other.test.sh
index 86a5afa575..0717f1a3df 100644
--- a/spec/var-op-other.test.sh
+++ b/spec/var-op-other.test.sh
@@ -53,6 +53,13 @@ echo ${s/?xx/_yy} ${s/%?xx/_yy}
 # N-I dash status: 2
 # N-I dash stdout-json: ""
 
+### Replace fixed strings
+s=xx_xx
+echo ${s/xx/yy} ${s//xx/yy} ${s/#xx/yy} ${s/%xx/yy}
+# stdout: yy_xx yy_yy yy_xx xx_yy
+# N-I dash status: 2
+# N-I dash stdout-json: ""
+
 ### Replace is longest match
 # If it were shortest, then you would just replace the first <html>
 s='begin <html></html> end'
@@ -91,6 +98,14 @@ echo status=$?
 # BUG bash/mksh status: 0
 # BUG bash/mksh stdout-json: "-a/b/c-\nstatus=0\n"
 
+### ${v/a} is the same as ${v/a/}  -- no replacement string
+v='aabb'
+echo ${v/a}
+echo status=$?
+# stdout-json: "abb\nstatus=0\n"
+# N-I dash stdout-json: ""
+# N-I dash status: 2
+
 ### String slice
 foo=abcdefg
 echo ${foo:1:3}
diff --git a/test/spec.sh b/test/spec.sh
index 2aadea8722..2bea6cdf31 100755
--- a/test/spec.sh
+++ b/test/spec.sh
@@ -349,12 +349,12 @@ var-op-test() {
 }
 
 var-op-other() {
-  sh-spec spec/var-op-other.test.sh --osh-failures-allowed 10 \
+  sh-spec spec/var-op-other.test.sh --osh-failures-allowed 5 \
     ${REF_SHELLS[@]} $OSH "$@"
 }
 
 var-op-strip() {
-  sh-spec spec/var-op-strip.test.sh --osh-failures-allowed 5 \
+  sh-spec spec/var-op-strip.test.sh --osh-failures-allowed 1 \
     ${REF_SHELLS[@]} $ZSH $OSH "$@"
 }