Permalink
Browse files

Implement Patsub ${x/pat/replace} and strip ops ${x#prefix}, etc.

There are two strategies, depending on the pattern.

1) Fixed strings use Python's string methods, e.g.
startswith/endswith/replace/slice.

2) Glob patterns are converted to Python regexes.  (Character classes
aren't currently supported.)

Then we use the regex engine for position information and
greedy/non-greedy matches.

Also:

- Added tests.
- Fix parsing.
- TODO: Unicode

Addresses issue #26.
  • Loading branch information...
Andy Chu Andy Chu
Andy Chu authored and Andy Chu committed Aug 14, 2017
1 parent 0af9b23 commit 8066fd72df83396d9301c2455575538ebd936ad2
Showing with 241 additions and 133 deletions.
  1. +62 −28 core/glob_.py
  2. +27 −7 core/glob_test.py
  3. +116 −81 core/word_eval.py
  4. +17 −13 osh/word_parse.py
  5. +2 −2 osh/word_parse_test.py
  6. +15 −0 spec/var-op-other.test.sh
  7. +2 −2 test/spec.sh
View
@@ -3,6 +3,8 @@
glob_.py
"""
import re
import libc
from core.util import log
@@ -55,56 +57,88 @@ def GlobEscape(s):
return escaped
def GlobToExtendedRegex(g):
"""Convert a glob to a libc extended regexp.
# We need to handle glob patterns, but fnmatch doesn't give you the positions
# of matches. So we convert globs to regexps.
For ${s//pat*/__}.
# There are two regex engines we can use. Each has advantages and
# disadvantages:
We need to use regcomp/regex because fnmatch doesn't give you the positions
of matches.
# Python regex:
# - Supports Greedy vs. Non-greedy (necessary for strip ops, but not patsub)
# - Doesn't rely on global variables for unicode. I think libc string
# functions use LOCALE?
# ERE:
# - Linear time algorithm
# - Save code space
# - Supports the same character classes as glob.
Why not use Python? To avoid backtracking? I think we should just use Python
here, because we want Unicode to be consistent too.
What other string ops are there?
def GlobToExtendedRegex(g):
  """Convert a glob to a libc extended regexp (ERE).

  This is currently an unimplemented stub: it always raises
  NotImplementedError.

  Args:
    g: The glob pattern.  (Unused by the current stub.)

  Returns:
    Intended contract: an ERE string, or None if the pattern is a constant
    string rather than a glob.

  Raises:
    NotImplementedError: always, for now.
  """
  # NOTE: character classes are retained literally, since EREs have the same
  # char class syntax?
  # Could be used for ${s//pat*/__}, but NOT # ## % %%.
  # We'll use Python everywhere for simplicity.
  raise NotImplementedError
def GlobToPythonRegex(s, greedy=True):
  """Convert a glob pattern to a Python regex string.

  We need Python's regex engine for greedy and non-greedy matches, e.g. for
  string ops like ${s#'*b'}.  libc doesn't have that.

  Args:
    s: The glob pattern.
    greedy: whether * should be '.*' (greedy) or '.*?' (non-greedy)

  NOTE: character classes aren't supported.

  Returns:
    A (regex, err) pair:
    - (regex string, None) if s contains an unescaped * or ?.
    - (None, None) if s is a constant string rather than a glob.
    - (None, True) if s contains an unescaped [ or ], i.e. a character class.
  """
  star_pat = '.*' if greedy else '.*?'

  is_glob = False
  out = []
  i = 0
  n = len(s)
  while i < n:
    c = s[i]
    if c == '\\':  # glob escape like \* or \?
      i += 1
      # A trailing backslash has nothing to escape; treat it as a literal
      # backslash rather than indexing past the end of the string.
      escaped = s[i] if i < n else c
      # The escaped char is a literal in the glob, so it must stay a literal
      # in the regex too, e.g. \* -> \* and not a bare * quantifier.
      out.append(re.escape(escaped))
    elif c == '*':
      is_glob = True
      out.append(star_pat)
    elif c == '?':
      is_glob = True
      out.append('.')
    elif c in '[]':
      # TODO: Should we enter a different state and parse these?
      return None, True  # TODO: better error
    else:
      # e.g. . -> \.
      out.append(re.escape(c))
    i += 1

  regex = ''.join(out) if is_glob else None
  return regex, None
def _GlobUnescape(s): # used by cmd_exec
View
@@ -77,12 +77,35 @@ def testPatSubRegexes(self):
# x=~/git/oil
# ${x//git*/X/}
# NOTE: This should be regcomp
r = re.compile('(^.*)git.*(.*)')
result = r.sub(r'\1' + 'X' + r'\2', '~/git/oil')
# git*
r1 = re.compile('git.*')
result = r1.sub('X', '~/git/oil')
self.assertEqual('~/X', result)
r2 = re.compile('[a-z]')
result = r2.sub('X', 'a-b-c')
self.assertEqual('X-X-X', result)
# Substitute the first one only
r2 = re.compile('[a-z]')
result = r2.sub('X', 'a-b-c', count=1)
self.assertEqual('X-b-c', result)
def testGlobToPythonRegex(self):
  """Check the (regex, err) pair GlobToPythonRegex returns for each glob."""
  CASES = [
      # glob input, expected regex, expected err
      # Raw strings are used because '\.' in a plain string literal is an
      # invalid escape sequence.
      ('*.py', r'.*\.py', None),
      ('*.?', r'.*\..', None),
      ('abc', None, None),  # constant string, not a glob
      ('[[:space:]]', None, True),  # character classes are an error
  ]
  for glob, expected_regex, expected_err in CASES:
    regex, err = glob_.GlobToPythonRegex(glob)
    self.assertEqual(expected_regex, regex,
                     '%s: expected %r, got %r' % (glob, expected_regex, regex))
    self.assertEqual(expected_err, err,
                     '%s: expected %r, got %r' % (glob, expected_err, err))
def testPatSubRegexesLibc(self):
r = libc.regex_parse('^(.*)git.*(.*)')
print(r)
@@ -94,8 +117,5 @@ def testPatSubRegexesLibc(self):
# We have to keep advancing the string until there are no more matches.
if __name__ == '__main__':
unittest.main()
Oops, something went wrong.

0 comments on commit 8066fd7

Please sign in to comment.