Implement Patsub ${x/pat/replace} and strip ops ${x#prefix}, etc.

There are two strategies, depending on the pattern. 1) Fixed strings use Python's string methods, e.g. startswith/endswith/replace/slice. 2) Glob patterns are converted to Python regexes. (Character classes aren't currently supported.) Then we use the regex engine for position information and greedy/non-greedy matches. Also: - Added tests. - Fix parsing. - TODO: Unicode. Addresses issue #26.
oils-for-unix · Aug 22, 2017 · 8066fd7 · 8066fd7
1 parent 0af9b23
commit 8066fd7
Show file tree

Hide file tree

Showing 7 changed files with 241 additions and 133 deletions.
diff --git a/core/glob_.py b/core/glob_.py
@@ -3,6 +3,8 @@
 glob_.py
 """
 
+import re
+
 import libc
 
 from core.util import log
@@ -55,56 +57,88 @@ def GlobEscape(s):
   return escaped
 
 
-def GlobToExtendedRegex(g):
-  """Convert a glob to a libc extended regexp.
+# We need to handle glob patterns, but fnmatch doesn't give you the positions
+# of matches.  So we convert globs to regexps.
 
-  For ${s//pat*/__}.
+# There are two regex engines we can use.  Each has advantages and
+# disadvantages:
 
-  We need to use regcomp/regex because fnmatch doesn't give you the positions
-  of matches.
+# Python regex:
+# - Supports Greedy vs. Non-greedy (necessary for strip ops, but not patsub)
+# - Doesn't rely on global variables for unicode.  I think libc string
+#   functions use LOCALE?
+
+# ERE:
+# - Linear time algorithm
+# - Save code space
+# - Supports the same character classes as glob.
 
-  Why not use Python?  To avoid backtracking?  I think we should just Python
-  here.  Because we want Unicode to be consistent too.
-  
-  What other string ops are there?
 
+def GlobToExtendedRegex(g):
+  """Convert a glob to a libc extended regexp.
 
   Returns:
     An ERE string, or None if the pattern is a constant string rather than
     a glob.
   """
-  # NOTE: character classes are retained literally, since EREs have the same
-  # char class syntax?
+  # Could be used for ${s//pat*/__}, but NOT # ## % %%.
+  # We'll use Python everywhere for simplicity.
+  raise NotImplementedError
 
 
-def GlobToPythonRegex(g, longest=True):
+def GlobToPythonRegex(s, greedy=True):
   """Convert a glob to a Python regex.
 
   Args:
-    longest: whether * should be '.*' (greedy) or '.*?' (non-greedy)
-
-  We need Python's engine for greedy and non-greedy matches.  libc doesn't have
-  that.
-
-  For string ops like ${s#'*b'}
+    greedy: whether * should be '.*' (greedy) or '.*?' (non-greedy)
 
   NOTE: character classes aren't supported.
 
   Returns:
     A Python regex string, or None if the pattern is a constant string
     rather than a glob.
+
+    regex, err?
   """
-  return None
-  # TODO:
-  # - Iterate through each characater
-  # - Check for escapes
-  # - If it 
-
-  if longest:
-    pass
+  star_pat = '.*' if greedy else '.*?'
+
+  is_glob = False
+  err = None
+
+  i = 0
+  n = len(s)
+  out = []
+  while i < n:
+    c = s[i]
+    if c == '\\':  # glob escape like \* or \?
+      i += 1
+      out.append(s[i])
+    elif c == '*':
+      is_glob = True
+      out.append(star_pat)
+    elif c == '?':
+      is_glob = True
+      out.append('.')
+    # TODO: Should we enter a different state and parse these?
+    elif c == '[':
+      err = True  # TODO: better error
+      break
+    elif c == ']':
+      err = True
+    else:
+      # e.g. . -> \.
+      out.append(re.escape(c))
+
+    i += 1
+
+  if err:
+    return None, err
   else:
-    pass
-  return '^' + '$'
+    if is_glob:
+      regex = ''.join(out)
+    else:
+      regex = None
+    return regex, err
 
 
 def _GlobUnescape(s):  # used by cmd_exec

diff --git a/core/glob_test.py b/core/glob_test.py
@@ -77,12 +77,35 @@ def testPatSubRegexes(self):
     # x=~/git/oil
     # ${x//git*/X/}
 
-    # NOTE: This should be regcomp
-    r = re.compile('(^.*)git.*(.*)')
-
-    result = r.sub(r'\1' + 'X' + r'\2', '~/git/oil')
+    # git*
+    r1 = re.compile('git.*')
+    result = r1.sub('X', '~/git/oil')
     self.assertEqual('~/X', result)
 
+    r2 = re.compile('[a-z]')
+    result = r2.sub('X', 'a-b-c')
+    self.assertEqual('X-X-X', result)
+
+    # Substitute the first one only
+    r2 = re.compile('[a-z]')
+    result = r2.sub('X', 'a-b-c', count=1)
+    self.assertEqual('X-b-c', result)
+
+  def testGlobToPythonRegex(self):
+    CASES = [
+        # glob input, (regex, err)
+        ('*.py', '.*\.py', None),
+        ('*.?', '.*\..', None),
+        ('abc', None, None),
+        ('[[:space:]]', None, True),
+    ]
+    for glob, expected_regex, expected_err in CASES:
+      regex, err = glob_.GlobToPythonRegex(glob)
+      self.assertEqual(expected_regex, regex,
+          '%s: expected %r, got %r' % (glob, expected_regex, regex))
+      self.assertEqual(expected_err, err,
+          '%s: expected %r, got %r' % (glob, expected_err, err))
+
   def testPatSubRegexesLibc(self):
     r = libc.regex_parse('^(.*)git.*(.*)')
     print(r)
@@ -94,8 +117,5 @@ def testPatSubRegexesLibc(self):
     # We have to keep advancing the string until there are no more matches.
 
 
-
-
-
 if __name__ == '__main__':
   unittest.main()