Skip to content

Commit

Permalink
Implement Patsub ${x/pat/replace} and strip ops ${x#prefix}, etc.
Browse files Browse the repository at this point in the history
There are two strategies, depending on the pattern.

1) Fixed strings use Python's string methods, e.g.
startswith/endswith/replace/slice.

2) Glob patterns are converted to Python regexes.  (Character classes
aren't currently supported.)

Then we use the regex engine for position information and
greedy/non-greedy matches.

Also:

- Added tests.
- Fix parsing.
- TODO: Unicode

Addresses issue #26.
  • Loading branch information
Andy Chu authored and Andy Chu committed Aug 22, 2017
1 parent 0af9b23 commit 8066fd7
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 133 deletions.
90 changes: 62 additions & 28 deletions core/glob_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
glob_.py
"""

import re

import libc

from core.util import log
Expand Down Expand Up @@ -55,56 +57,88 @@ def GlobEscape(s):
return escaped


def GlobToExtendedRegex(g):
"""Convert a glob to a libc extended regexp.
# We need to handle glob patterns, but fnmatch doesn't give you the positions
# of matches. So we convert globs to regexps.

For ${s//pat*/__}.
# There are two regex engines we can use. Each has advantages and
# disadvantages:

We need to use regcomp/regex because fnmatch doesn't give you the positions
of matches.
# Python regex:
# - Supports Greedy vs. Non-greedy (necessary for strip ops, but not patsub)
# - Doesn't rely on global variables for unicode. I think libc string
# functions use LOCALE?

# ERE:
# - Linear time algorithm
# - Save code space
# - Supports the same character classes as glob.

Why not use Python? To avoid backtracking? I think we should just use Python
here, because we want Unicode to be consistent too.
What other string ops are there?

def GlobToExtendedRegex(g):
  """Convert a glob to a libc extended regexp (ERE).  Not implemented.

  An ERE could be used for ${s//pat*/__}, but NOT for the strip ops
  # ## % %%, which need greedy vs. non-greedy matching.  We'll use Python
  regexes everywhere for simplicity.

  Args:
    g: the glob pattern (unused).

  Raises:
    NotImplementedError: always.
  """
  raise NotImplementedError(
      'GlobToExtendedRegex is not implemented; use GlobToPythonRegex')


def GlobToPythonRegex(s, greedy=True):
  """Convert a glob pattern to a Python regex string.

  For string ops like ${s#'*b'}.  We need Python's engine for greedy and
  non-greedy matches; libc doesn't have that.

  Args:
    s: the glob pattern.  A backslash escapes the following character, so an
      escaped star or question mark is matched literally.
    greedy: whether * should be '.*' (greedy) or '.*?' (non-greedy)

  NOTE: character classes aren't supported.  The regex is not anchored, and
  '.' only matches newlines if the caller compiles it with re.DOTALL.

  Returns:
    A (regex, err) pair.
      regex: a Python regex string, or None if the pattern is a constant
        string rather than a glob, or if err is set.
      err: None on success; True if the pattern can't be translated (it
        contains '[' or ']', or ends with a dangling backslash).
  """
  star_pat = '.*' if greedy else '.*?'

  is_glob = False
  err = None
  out = []

  i = 0
  n = len(s)
  while i < n:
    c = s[i]
    if c == '\\':  # glob escape like \* or \?
      i += 1
      if i == n:
        # Dangling backslash: there is nothing to escape.  Report it through
        # err rather than crashing with IndexError.
        err = True  # TODO: better error
        break
      # The escaped character is a literal in the glob, so it has to be a
      # literal in the regex too, e.g. \* -> \* and not a bare quantifier.
      out.append(re.escape(s[i]))
    elif c == '*':
      is_glob = True
      out.append(star_pat)
    elif c == '?':
      is_glob = True
      out.append('.')
    elif c in '[]':
      # TODO: Should we enter a different state and parse these?
      err = True  # TODO: better error
      break
    else:
      # e.g. . -> \.
      out.append(re.escape(c))

    i += 1

  if err:
    return None, err

  regex = ''.join(out) if is_glob else None
  return regex, err


def _GlobUnescape(s): # used by cmd_exec
Expand Down
34 changes: 27 additions & 7 deletions core/glob_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,35 @@ def testPatSubRegexes(self):
# x=~/git/oil
# ${x//git*/X/}

# NOTE: This should be regcomp
r = re.compile('(^.*)git.*(.*)')

result = r.sub(r'\1' + 'X' + r'\2', '~/git/oil')
# git*
r1 = re.compile('git.*')
result = r1.sub('X', '~/git/oil')
self.assertEqual('~/X', result)

r2 = re.compile('[a-z]')
result = r2.sub('X', 'a-b-c')
self.assertEqual('X-X-X', result)

# Substitute the first one only
r2 = re.compile('[a-z]')
result = r2.sub('X', 'a-b-c', count=1)
self.assertEqual('X-b-c', result)

def testGlobToPythonRegex(self):
  """Check glob_.GlobToPythonRegex against a table of expected results."""
  CASES = [
      # glob input, expected regex, expected err
      # Raw strings keep the regex backslashes from being read as (invalid)
      # string escape sequences.
      ('*.py', r'.*\.py', None),
      ('*.?', r'.*\..', None),
      ('abc', None, None),
      ('[[:space:]]', None, True),
  ]
  for glob, expected_regex, expected_err in CASES:
    regex, err = glob_.GlobToPythonRegex(glob)
    self.assertEqual(expected_regex, regex,
        '%s: expected %r, got %r' % (glob, expected_regex, regex))
    self.assertEqual(expected_err, err,
        '%s: expected %r, got %r' % (glob, expected_err, err))

def testPatSubRegexesLibc(self):
r = libc.regex_parse('^(.*)git.*(.*)')
print(r)
Expand All @@ -94,8 +117,5 @@ def testPatSubRegexesLibc(self):
# We have to keep advancing the string until there are no more matches.





if __name__ == '__main__':
unittest.main()
Loading

0 comments on commit 8066fd7

Please sign in to comment.