Skip to content

Commit

Permalink
[eggex] Collect list of named captures
Browse files Browse the repository at this point in the history
It seems we can do a straight tree traversal, collecting the groups in the
order that each group's opening ( appears.  We will "invert" the list to a map of {name: index}.

TODO: still needs testing.
  • Loading branch information
Andy Chu committed Dec 12, 2023
1 parent 94cbddf commit 6b6319e
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 17 deletions.
1 change: 1 addition & 0 deletions builtin/method_other.py
Expand Up @@ -37,6 +37,7 @@ def Call(self, rd):
START = 1
END = 2


class MatchAccess(vm._Callable):

def __init__(self, method):
Expand Down
6 changes: 5 additions & 1 deletion core/value.asdl
Expand Up @@ -9,6 +9,7 @@ module value
DoubleQuoted
re proc_sig
LiteralBlock Func
NameType
}

use core runtime {
Expand Down Expand Up @@ -72,7 +73,10 @@ module value
# callable, in separate namespaces: Func, BoundFunc, Proc

# / d+; ignorecase / -> '[[:digit:]]+' REG_ICASE
| Eggex(re expr, List[str] flags, str? as_ere, int ere_flags)
| Eggex(re expr, List[str] flags, str? as_ere, int ere_flags,
# inner ? is because some groups are not named
# outer ? because it's not set until ERE translation is done
List[NameType?]? name_types)

# indices has G groups and N matches. -1 values indicate no match.
# We flatten it to reduce allocations, and because group() start() end()
Expand Down
22 changes: 20 additions & 2 deletions cpp/libc_test.cc
Expand Up @@ -187,7 +187,7 @@ TEST regex_lexer() {
PASS();
}

TEST regex_nested_groups() {
TEST regex_repeat_with_capture() {
const char* lexer = "(([a-z]+)([0-9]+)-)*((A+)|(Z+))*";
FindAll(lexer, "a0-b1-c2-AAZZZA");
// Groups are weird
Expand All @@ -206,6 +206,22 @@ TEST regex_nested_groups() {
PASS();
}

// Disallow this in eggex, as well as the above.
//
// The pattern wraps two capture groups inside an outer capture group:
//   ( ([a-z]+) ([0-9]+) )
// This function makes no assertions of its own: it hands the pattern and the
// subject string to FindAll() and then reports success.
TEST regex_nested_capture() {
  const char* pattern = "(([a-z]+)([0-9]+))";
  const char* subject = "a0";

  FindAll(pattern, subject);

  PASS();
}

// I think we allow this in eggex.
//
// Alternation with captures on both sides.  The left branch is one capture
// group, ([a-z]+); the right branch is two adjacent capture groups,
// ([0-9]+)(-).  The same pattern is run against two subjects, in order:
// one starting with a letter and one starting with a digit.
// This function makes no assertions of its own.
TEST regex_alt_with_capture() {
  const char* pattern = "([a-z]+)|([0-9]+)(-)";
  const char* subjects[] = {"x-", "7-"};

  for (const char* subject : subjects) {
    FindAll(pattern, subject);
  }

  PASS();
}


GREATEST_MAIN_DEFS();

int main(int argc, char** argv) {
Expand All @@ -222,7 +238,9 @@ int main(int argc, char** argv) {
RUN_TEST(regex_unanchored);
RUN_TEST(regex_caret);
RUN_TEST(regex_lexer);
RUN_TEST(regex_nested_groups);
RUN_TEST(regex_repeat_with_capture);
RUN_TEST(regex_alt_with_capture);
RUN_TEST(regex_nested_capture);

gHeap.CleanProcessExit();

Expand Down
10 changes: 4 additions & 6 deletions ysh/expr_eval.py
Expand Up @@ -1321,8 +1321,8 @@ def _EvalRegex(self, node):

elif case(value_e.Eggex):
val = cast(value.Eggex, UP_val)
# Note: we only splice the regex, and ignore flags.
# Should we warn about this?
# TODO: warn about flags that don't match
# This check will be transitive
to_splice = val.expr

else:
Expand Down Expand Up @@ -1350,11 +1350,9 @@ def EvalEggex(self, node):
# - check for incompatible flags, like i
# - or can the root override flags? Probably not
# - check for named captures not at the top level
new_node = self._EvalRegex(node.regex)

spliced = self._EvalRegex(node.regex)
flags = [lexer.TokenVal(tok) for tok in node.flags]

return value.Eggex(new_node, flags, None, 0)
return value.Eggex(spliced, flags, None, 0, None)


# vim: sw=4
40 changes: 32 additions & 8 deletions ysh/regex_translate.py
Expand Up @@ -13,6 +13,7 @@
re_e,
re_repeat,
re_repeat_e,
NameType,
)
from _devbuild.gen.id_kind_asdl import Id
from _devbuild.gen.value_asdl import value
Expand Down Expand Up @@ -151,8 +152,8 @@ def _CharClassTermToEre(term, parts, special_char_flags):
raise AssertionError(term)


def _AsPosixEre(node, parts):
# type: (re_t, List[str]) -> None
def _AsPosixEre(node, parts, name_types):
# type: (re_t, List[str], List[NameType]) -> None
"""Translate an Oil regex to a POSIX ERE.
Appends to a list of parts that you have to join.
Expand Down Expand Up @@ -188,15 +189,15 @@ def _AsPosixEre(node, parts):
if tag == re_e.Seq:
node = cast(re.Seq, UP_node)
for c in node.children:
_AsPosixEre(c, parts)
_AsPosixEre(c, parts, name_types)
return

if tag == re_e.Alt:
node = cast(re.Alt, UP_node)
for i, c in enumerate(node.children):
if i != 0:
parts.append('|')
_AsPosixEre(c, parts)
_AsPosixEre(c, parts, name_types)
return

if tag == re_e.Repeat:
Expand All @@ -211,7 +212,7 @@ def _AsPosixEre(node, parts):
"POSIX EREs don't have groups without capture, so this node "
"needs () around it.", child.blame_tok)

_AsPosixEre(node.child, parts)
_AsPosixEre(node.child, parts, name_types)
op = node.op
op_tag = op.tag()
UP_op = op
Expand Down Expand Up @@ -244,10 +245,25 @@ def _AsPosixEre(node, parts):
raise NotImplementedError(op_tag)

# Special case for familiarity: () is acceptable as a group in ERE
if tag in (re_e.Group, re_e.Capture):
if tag == re_e.Group:
node = cast(re.Group, UP_node)

# placeholder so we know this group is numbered, but not named
name_types.append(None)

parts.append('(')
_AsPosixEre(node.child, parts, name_types)
parts.append(')')
return

if tag == re_e.Capture:
node = cast(re.Capture, UP_node)

# Collect in order of ( appearance
name_types.append(node.name_type)

parts.append('(')
_AsPosixEre(node.child, parts)
_AsPosixEre(node.child, parts, name_types)
parts.append(')')
return

Expand Down Expand Up @@ -319,6 +335,14 @@ def AsPosixEre(eggex):
return eggex.as_ere

parts = [] # type: List[str]
_AsPosixEre(eggex.expr, parts)
name_types = [] # type: List[NameType]

_AsPosixEre(eggex.expr, parts, name_types)

#names = [n.name.tval for n in name_types]
#log('names %s', names)

eggex.as_ere = ''.join(parts)
eggex.name_types = name_types

return eggex.as_ere

0 comments on commit 6b6319e

Please sign in to comment.