Permalink
Browse files

Rewrite ${//} with the libc regexec() API.

A new pylibc primitive: first_group_match() is used to implement the
four variants of ${//}.

We convert to ERE syntax instead of Python syntax.

- Make note of bugs in this conversion!  (still need to fix)
- Remove dependencies on the 're' module!  Yay.

Unrelated:
- Add a URL/comment to OPy's symbols.py module.
- Lint script functions
  • Loading branch information...
Andy Chu
Andy Chu committed May 21, 2018
1 parent 62f4872 commit fc294430832dec4f49b9422961fb4fe2d5bf02e6
Showing with 186 additions and 60 deletions.
  1. +1 −1 build/dev.sh
  2. +23 −29 core/glob_.py
  3. +14 −3 core/glob_test.py
  4. +49 −12 core/libstr.py
  5. +23 −0 core/libstr_test.py
  6. +46 −1 native/libc.c
  7. +15 −13 native/libc_test.py
  8. +9 −1 opy/compiler2/symbols.py
  9. +6 −0 test/lint.sh
View
@@ -74,7 +74,7 @@ py-ext() {
# Run py-ext for the 'libc' extension (presumably a build step -- py-ext is
# defined outside this view), then run the native/libc_test.py script with
# the repo root on PYTHONPATH.
# NOTE(review): the test script is invoked twice, the second time forwarding
# this function's arguments ("$@").  These look like the before/after sides of
# a one-line diff -- confirm whether the first invocation is meant to remain.
pylibc() {
py-ext libc build/setup.py
PYTHONPATH=. native/libc_test.py
PYTHONPATH=. native/libc_test.py "$@"
}
fastlex() {
View
@@ -3,8 +3,6 @@
glob_.py
"""
import re
try:
import libc
except ImportError:
@@ -64,60 +62,56 @@ def GlobEscape(s):
# We need to handle glob patterns, but fnmatch doesn't give you the positions
# of matches. So we convert globs to regexps.
# TODO: Use this for ${s//pat*/__}
# NOTE: Is [!abc] negation rather than [^abc] ?
# What about unicode? Do we have to set any global variables.
# Problems:
# - What about unicode? Do we have to set any global variables? We want it to
# always use utf-8?
# - Character class for glob is different than char class for regex? According
# to the standard, anyway.
# - Honestly I would like a more principled parser for globs! Can re2c do
# better here?
def GlobToExtendedRegex(g):
def GlobToExtendedRegex(glob_pat):
"""Convert a glob to a libc extended regexp.
Returns:
An ERE string, or None if the pattern is a constant string rather than
a glob.
"""
raise NotImplementedError
def GlobToPythonRegex(s):
"""Convert a glob to a libc extended regexp.
Args:
greedy: whether * should be '.*' (greedy) or '.*?' (non-greedy)
NOTE: character classes aren't supported.
Returns:
A Python regex string, or None if the pattern is a constant string
rather than a glob.
regex, err?
"""
is_glob = False
err = None
i = 0
n = len(s)
n = len(glob_pat)
out = []
while i < n:
c = s[i]
c = glob_pat[i]
if c == '\\': # glob escape like \* or \?
# BUG: This isn't correct because \* is escaping a glob character, but
# then it's also a regex metacharacter. We should really parse the glob
# into a symbolic form first, not do text->text conversion.
# Hard test case: \** as a glob -> \*.* as a regex.
i += 1
out.append(s[i])
out.append(glob_pat[i])
elif c == '*':
is_glob = True
out.append('.*')
elif c == '?':
is_glob = True
out.append('.')
# TODO: Should we enter a different state and parse these?
# TODO: Enter a different state and parse character classes
# NOTE: Is [!abc] negation rather than [^abc] ?
elif c == '[':
err = True # TODO: better error
break
elif c == ']':
err = True
# Escape a single character for extended regex literals.
# https://www.gnu.org/software/findutils/manual/html_node/find_html/posix_002dextended-regular-expression-syntax.html
elif c in '.|^$()+': # Already handled \ * ? []
out.append('\\' + c)
else:
# e.g. . -> \.
out.append(re.escape(c))
out.append(c)
i += 1
View
@@ -92,16 +92,27 @@ def testPatSubRegexes(self):
result = r2.sub('X', 'a-b-c', count=1)
self.assertEqual('X-b-c', result)
def testGlobToPythonRegex(self):
def testGlobToExtendedRegex(self):
CASES = [
# glob input, (regex, err)
('*.py', '.*\.py', None),
('*.?', '.*\..', None),
('abc', None, None),
('<*>', '<.*>', None),
#('\\*', '\\*', None), # not a glob, a string
# Hard case: a literal * and then a glob
#('\\**', '\\**', None),
#('c:\\foo', 'c:\\\\foo', None),
('abc', None, None), # not a glob
# TODO: These should be parsed
('[[:space:]]', None, True),
('[abc]', None, True),
('[abc\[]', None, True),
]
for glob, expected_regex, expected_err in CASES:
regex, err = glob_.GlobToPythonRegex(glob)
regex, err = glob_.GlobToExtendedRegex(glob)
self.assertEqual(expected_regex, regex,
'%s: expected %r, got %r' % (glob, expected_regex, regex))
self.assertEqual(expected_err, err,
View
@@ -13,8 +13,6 @@
var y = x -> sub( g/a*/, 'b', :ALL)
"""
import re
import libc
from osh.meta import Id
@@ -121,14 +119,48 @@ def DoUnarySuffixOp(s, op, arg):
return s
def _AllMatchPositions(s, regex):
  """Returns a list of all (start, end) match positions of the regex against s.

  Matches are searched for left to right, each search starting where the
  previous match ended, so the reported spans never overlap.

  Zero-length matches are NOT reported: there is nothing to replace at such a
  position, and treating them like ordinary matches would never advance the
  search position (e.g. '(.*)' once the end of s is reached, or any pattern
  that can match the empty string when s is empty).
  NOTE(review): this presumably agrees with bash, where ${s//*/x} yields a
  single 'x' for a non-empty s -- confirm against the spec tests.

  Args:
    s: the string to search.
    regex: an ERE string whose first group delimits the match, e.g. '(X.)'.

  Returns:
    A list of (start, end) tuples indexing into s.  (If there are no matches,
    it returns the empty list.)
  """
  matches = []
  pos = 0
  n = len(s)
  # pos == n is still a valid offset (the empty tail of s); anything beyond
  # that would point past the end of the buffer handed to libc.
  while pos <= n:
    m = libc.regex_first_group_match(regex, s, pos)
    if m is None:
      break
    start, end = m
    log('m = %r, %r' % (start, end))
    if end > start:
      matches.append(m)
      pos = end  # advance position
    else:
      # Zero-length match: skip it and step one position past it so the loop
      # is guaranteed to make progress.
      pos = end + 1
  return matches
def _PatSubAll(s, regex, replace_str):
  """Replace every match of regex in s with replace_str.

  The match spans come from _AllMatchPositions(); the text between
  consecutive spans is copied through unchanged.

  Returns:
    The new string.
  """
  pieces = []
  cursor = 0  # end of the previous match, i.e. start of the unmatched text
  for m_start, m_end in _AllMatchPositions(s, regex):
    pieces.extend((s[cursor:m_start], replace_str))
    cursor = m_end
  # Whatever follows the last match (or all of s, if nothing matched).
  pieces.append(s[cursor:])
  return ''.join(pieces)
# TODO: For patsub of arrays, it would be worth it to CACHE the constant part
# of this computation. Turn this into a class, which translates and regcomp()s
# the regex exactly once.
def PatSub(s, op, pat, replace_str):
"""Helper for ${x/pat/replace}."""
#log('PAT %r REPLACE %r', pat, replace_str)
py_regex, err = glob_.GlobToPythonRegex(pat)
regex, err = glob_.GlobToExtendedRegex(pat)
if err:
e_die("Can't convert glob to regex: %r", pat)
if py_regex is None: # Simple/fast path for fixed strings
if regex is None: # Simple/fast path for fixed strings
if op.do_all:
return s.replace(pat, replace_str)
elif op.do_prefix:
@@ -147,14 +179,19 @@ def PatSub(s, op, pat, replace_str):
return s.replace(pat, replace_str, 1) # just the first one
else:
count = 1 # replace first occurrence only
regex = '(%s)' % regex # make it a group
if op.do_all:
count = 0 # replace all
elif op.do_prefix:
py_regex = '^' + py_regex
elif op.do_suffix:
py_regex = py_regex + '$'
return _PatSubAll(s, regex, replace_str) # loop over matches
pat_re = re.compile(py_regex)
return pat_re.sub(replace_str, s, count)
if op.do_prefix:
regex = '^' + regex
elif op.do_suffix:
regex = regex + '$'
m = libc.regex_first_group_match(regex, s, 0)
log('regex = %r, s = %r, match = %r', regex, s, m)
if m is None:
return s
start, end = m
return s[:start] + replace_str + s[end:]
View
@@ -40,6 +40,29 @@ def testUnarySuffixOpDemo(self):
print('%d test %06r return %06r' % (i, s[i:], s[:i]))
print()
def testPatSubAllMatches(self):
  """Exercise libstr._AllMatchPositions() and libstr._PatSubAll()."""
  s = 'oXooXoooX'
  matching = '(X.)'  # an X followed by any one character
  non_matching = '(z)'  # there is no 'z' in s

  # Match positions
  positions = libstr._AllMatchPositions(s, matching)
  self.assertEqual([(1, 3), (4, 6)], positions)

  # No match
  positions = libstr._AllMatchPositions(s, non_matching)
  self.assertEqual([], positions)

  # Replacement
  replaced = libstr._PatSubAll(s, matching, '_')
  self.assertEqual('o_o_ooX', replaced)

  # Replacement with no match
  replaced = libstr._PatSubAll(s, non_matching, '_')
  self.assertEqual(s, replaced)
if __name__ == '__main__':
unittest.main()
View
@@ -231,7 +231,6 @@ func_regex_match(PyObject *self, PyObject *args) {
fprintf(stderr, "Invalid regex at runtime\n");
return PyLong_FromLong(-1);
}
//regcomp(&pat, pattern, REG_EXTENDED);
int ret;
// must match at pos 0
@@ -269,6 +268,49 @@ func_regex_match(PyObject *self, PyObject *args) {
}
}
// For ${//}, the number of groups is always 1, so we want 2 match position
// results -- the whole regex (which we ignore), and then first group.
//
// For [[ =~ ]], do we need to count how many matches the user gave?
#define NMATCH 2
// regex_first_group_match(pattern, str, pos)
//
// Compiles 'pattern' as a POSIX ERE and searches 'str' starting at byte
// offset 'pos'.
//
// Returns:
//   (start, end) -- offsets of the first group, relative to the start of
//                   'str' (not relative to 'pos').
//   None         -- if the regex doesn't match, or if it matches but the
//                   first group took no part in the match.
//   -1           -- if the pattern fails to compile (same convention as
//                   func_regex_match).
// Raises:
//   ValueError if 'pos' doesn't lie within 'str'.
static PyObject *
func_regex_first_group_match(PyObject *self, PyObject *args) {
  const char* pattern;
  const char* str;
  int pos;
  if (!PyArg_ParseTuple(args, "ssi", &pattern, &str, &pos)) {
    return NULL;
  }

  // 'str + pos' below must stay inside the NUL-terminated buffer; pos ==
  // strlen(str) is OK because it points at the terminator (empty tail).
  if (pos < 0 || (size_t)pos > strlen(str)) {
    PyErr_SetString(PyExc_ValueError, "pos is out of range for the string");
    return NULL;
  }

  regex_t pat;
  regmatch_t m[NMATCH];

  // Could have been checked by regex_parse for [[ =~ ]], but not for glob
  // patterns like ${foo/x*/y}.
  if (regcomp(&pat, pattern, REG_EXTENDED) != 0) {
    fprintf(stderr, "Invalid regex at runtime\n");
    return PyLong_FromLong(-1);
  }

  debug("first_group_match pat %s str %s pos %d", pattern, str, pos);

  // Match at offset 'pos'
  int result = regexec(&pat, str + pos, NMATCH, m, 0 /*flags*/);
  regfree(&pat);

  if (result != 0) {
    Py_RETURN_NONE;  // no match
  }

  // regexec() sets the offsets to -1 for a group that didn't participate in
  // the match (or when the pattern has no group at all).  Report that as "no
  // match" rather than returning bogus positions.
  if (m[1].rm_so == -1) {
    Py_RETURN_NONE;
  }

  regoff_t start = m[1].rm_so;
  regoff_t end = m[1].rm_eo;
  // regoff_t may be wider than int on some platforms, but the "i" format
  // reads an int, so cast explicitly.  The values fit: they are bounded by
  // the string length, and pos itself is an int.
  return Py_BuildValue("(i,i)", (int)(pos + start), (int)(pos + end));
}
static PyMethodDef methods[] = {
{"fnmatch", func_fnmatch, METH_VARARGS,
"Return whether a string matches a pattern."},
@@ -280,6 +322,9 @@ static PyMethodDef methods[] = {
"Compile a regex in ERE syntax, returning whether it is valid"},
{"regex_match", func_regex_match, METH_VARARGS,
"Match regex against a string, returning a list of matches"},
{"regex_first_group_match", func_regex_first_group_match, METH_VARARGS,
"If matching, return the start and end position of the first group. "
"Otherwise None."},
{NULL, NULL},
};
View
@@ -75,19 +75,21 @@ def testRegex(self):
# Error.
print(libc.regex_match(r'*', 'abcd'))
def testRegexReplace(self):
  """Table of expectations for libc.regex_replace() -- currently disabled.

  The early 'return' below means the table is built but never checked.
  """
  # (pattern, replacement, input string, do_all, expected result)
  cases = [
      (r'.\.py', 'X', 'foo.py', False, 'foX'),
      (r'^\.py', 'X', 'foo.py', False, 'foo.py'),  # Anchored left
      (r'foo$', 'X', 'foo.py', False, 'foo.py'),  # Anchored Right
      (r'o', 'X', 'foo.py', False, 'fXo.py'),  # replace first only
      (r'o', 'X', 'foo.py', True, 'fXX.py'),  # replace all
  ]
  return

  for pattern, replacement, subject, replace_all, want in cases:
    got = libc.regex_replace(pattern, replacement, subject, replace_all)
    self.assertEqual(want, got)
def testRegexFirstGroupMatch(self):
  """libc.regex_first_group_match() reports offsets relative to the string.

  The same pattern is searched from increasing start positions; each result
  is the (start, end) of the first group, counted from the beginning of s
  rather than from the search position.
  """
  s = 'oXooXoooXoX'
  pat = '(X.)'

  # (search start position, expected span of the first group)
  expectations = [
      (0, (1, 3)),
      (3, (4, 6)),   # Match from position 3
      (6, (8, 10)),  # Match from position 6
  ]
  for pos, expected in expectations:
    self.assertEqual(expected, libc.regex_first_group_match(pat, s, pos))
if __name__ == '__main__':
View
@@ -1,4 +1,12 @@
"""Module symbol-table generator"""
"""Statically classify symbols by scope, for code generation.
Great article about CPython's symbol tables, which are slightly different:
https://eli.thegreenplace.net/2010/09/20/python-internals-symbol-tables-part-2
In particular, see footnote 6: Why does CPython's algorithm have 2 passes,
while there is only one pass here?
"""
from __future__ import print_function
from . import ast
View
@@ -11,6 +11,12 @@ set -o errexit
source build/common.sh
# ubuntu names
# Install the flake8 linter: first the python-pip package via apt (needs
# sudo), then flake8 itself via pip.
install-flake8() {
sudo apt install python-pip
pip install flake8
}
get-cpplint() {
mkdir -p _tmp
wget --directory _tmp \

0 comments on commit fc29443

Please sign in to comment.