Skip to content

Commit

Permalink
[eggex] Implement value.Match() API
Browse files Browse the repository at this point in the history
This commit adds accessor methods on value.Match objects:

    m => group(i)
    m => start(i)
    m => end(i)

These behave just like the global functions:

    _group(i)
    _start(i)
    _end(i)

Next: implement the pos= parameter.

After that: add the regcomp() cache in C++.
  • Loading branch information
Andy C committed Dec 16, 2023
1 parent 69b271f commit 1124655
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 71 deletions.
63 changes: 37 additions & 26 deletions builtin/func_eggex.py
Expand Up @@ -4,29 +4,59 @@
"""
from __future__ import print_function

from _devbuild.gen.syntax_asdl import loc_t
from _devbuild.gen.value_asdl import value, value_t
from core import error
from core import state
from core import vm
from frontend import typed_args

from typing import List

M = 0 # _match() _group()
G = 0 # _match() _group()
S = 1 # _start()
E = 2 # _end()


def GetMatch(s, indices, i, to_return, blame_loc):
    # type: (str, List[int], int, int, loc_t) -> value_t
    """Return the start, end, or matched text of capture group i.

    Args:
      s: The string the regex was matched against.
      indices: Flat list of positions.  Group k's start is indices[2*k] and
        its end is indices[2*k + 1].
      i: Capture group number.  Group 0 is counted among the groups.
      to_return: S to get the start position, E to get the end position;
        any other value (e.g. G) returns the matched substring.
      blame_loc: Location attached to the error when i is out of range.

    Returns:
      value.Int for S and E.  Otherwise value.Str(s[start:end]), or
      value.Null when the group's start position is -1.

    Raises:
      error.UserError: (status 2) when there are no groups, or when i is
        negative or not less than the number of groups.
    """
    # NOTE(review): '/' is integer division under Python 2, which this file
    # appears to target (type comments, __future__ import) -- confirm before
    # running under Python 3.
    num_groups = len(indices) / 2  # including group 0

    if num_groups == 0:
        raise error.UserError(2, 'No regex capture groups', blame_loc)

    # A negative index would otherwise wrap around in the list lookup below
    # and silently report a different group, so reject it explicitly.
    if i < 0:
        msg = 'Expected non-negative capture group, got %d' % i
        raise error.UserError(2, msg, blame_loc)

    if i >= num_groups:
        msg = 'Expected capture group less than %d, got %d' % (num_groups,
                                                               i)
        raise error.UserError(2, msg, blame_loc)

    start = indices[2 * i]
    if to_return == S:
        return value.Int(start)

    end = indices[2 * i + 1]
    if to_return == E:
        return value.Int(end)

    # Asking for the group's text
    if start == -1:
        return value.Null
    else:
        return value.Str(s[start:end])


class MatchAccess(vm._Callable):
"""
_match(0) or _match(): get the whole match _match(1) ..
_group(0) or _group() : get the whole match
_group(1) to _group(N): get a submatch
_group('month') : get group by name
_match(N): submatch
Ditto for _start() and _end()
"""

def __init__(self, mem, which_func):
def __init__(self, mem, to_return):
# type: (state.Mem, int) -> None
vm._Callable.__init__(self)
self.mem = mem
self.which_func = which_func
self.to_return = to_return

def Call(self, rd):
# type: (typed_args.Reader) -> value_t
Expand All @@ -35,27 +65,8 @@ def Call(self, rd):
i = rd.OptionalInt(default_=0)

s, indices = self.mem.GetRegexIndices()
num_groups = len(indices) / 2 # including group 0
if i < num_groups:
start = indices[2 * i]
if self.which_func == S:
return value.Int(start)

end = indices[2 * i + 1]
if self.which_func == E:
return value.Int(end)

if start == -1:
return value.Null
else:
return value.Str(s[start:end])
else:
if num_groups == 0:
msg = 'No regex capture groups'
else:
msg = 'Expected capture group less than %d, got %d' % (
num_groups, i)
raise error.UserError(2, msg, rd.LeftParenToken())

return GetMatch(s, indices, i, self.to_return, rd.LeftParenToken())


# vim: sw=4
21 changes: 8 additions & 13 deletions builtin/method_other.py
Expand Up @@ -4,6 +4,7 @@

from _devbuild.gen.value_asdl import (value, value_t)

from builtin import func_eggex
from core import state
from core import vm
from frontend import typed_args
Expand Down Expand Up @@ -32,28 +33,22 @@ def Call(self, rd):
return value.Null


# which method group() start() end()
GROUP = 0
START = 1
END = 2


class MatchAccess(vm._Callable):

def __init__(self, method):
def __init__(self, to_return):
# type: (int) -> None
self.method = method
self.to_return = to_return

def Call(self, rd):
# type: (typed_args.Reader) -> value_t

# This is guaranteed
m = rd.PosMatch()
# TODO: Support strings for named captures
i = rd.OptionalInt(default_=0)
#val = rd.PosValue()

# string name or integer
val = rd.PosValue()
rd.Done()

# TODO: look at m.indices and return a string

return value.Null
return func_eggex.GetMatch(m.s, m.indices, i, self.to_return,
rd.LeftParenToken())
29 changes: 15 additions & 14 deletions builtin/method_str.py
Expand Up @@ -7,6 +7,9 @@
from core import vm
from frontend import typed_args
from mycpp.mylib import log
from ysh import regex_translate

import libc

_ = log

Expand Down Expand Up @@ -75,21 +78,19 @@ def Call(self, rd):
"""
s => search(eggex, pos=0)
"""
string = rd.PosStr()
eggex_val = rd.PosEggex()

eggex = rd.PosEggex()
# don't confuse 'start' and 'pos'?
# Python has 2 kinds of 'pos'
# Don't confuse 'start' and 'pos'. Python has 2 kinds of 'start' in its regex API.
pos = rd.NamedInt('pos', 0)
rd.Done()

# TODO:
#
# call libc.regex_search(str ERE, int flags, str s, int pos)
#
# which should return non-empty List[int] of positions, or None
#
# - it uses the regcomp cache
# - TODO: eggex evaluation has to cache the group names, and number of
# groups

return value.Null
ere = regex_translate.AsPosixEre(eggex_val) # lazily converts to ERE

flags = 0 # TODO: translate flags
indices = libc.regex_search(ere, flags, string, 0)

if indices is None:
return value.Null

return value.Match(string, indices)
26 changes: 15 additions & 11 deletions core/shell.py
Expand Up @@ -723,20 +723,22 @@ def Main(
'upper': method_str.Upper(),
'lower': None,

# finds a substring, OR an eggex
# should it be findStr / replaceStr vs. findPat / replacePat? subst()
# finds a substring, optional position to start at
'find': None,

# Match eggex at certain position? A constant string is also useful
# for lexing.
'match': None,

# replace substring, OR an eggex
'replace': None,

# Like Python's re.search, except we put it on the string object
# It's more consistent with Str->find(substring, pos=0)
# It returns value.Match() rather than an integer
'search': method_str.Search(),

# like Python's re.match()
'leftMatch': None,

# like Python's re.fullmatch(), not sure if we really need it
'fullMatch': None,
}
methods[value_e.Dict] = {
'get': None, # doesn't raise an error
Expand Down Expand Up @@ -770,11 +772,10 @@ def Main(
'join': func_misc.Join(), # both a method and a func
}

# TODO: implement these
methods[value_e.Match] = {
'group': method_other.MatchAccess(method_other.GROUP),
'start': method_other.MatchAccess(method_other.START),
'end': method_other.MatchAccess(method_other.END),
'group': method_other.MatchAccess(func_eggex.G),
'start': method_other.MatchAccess(func_eggex.S),
'end': method_other.MatchAccess(func_eggex.E),
}

methods[value_e.IO] = {
Expand Down Expand Up @@ -812,9 +813,12 @@ def Main(
_SetGlobalFunc(mem, '_hay', hay_func)

_SetGlobalFunc(mem, 'len', func_misc.Len())
_SetGlobalFunc(mem, '_match', func_eggex.MatchAccess(mem, func_eggex.M))

# TODO: rename _match() to _group()
_SetGlobalFunc(mem, '_match', func_eggex.MatchAccess(mem, func_eggex.G))
_SetGlobalFunc(mem, '_start', func_eggex.MatchAccess(mem, func_eggex.S))
_SetGlobalFunc(mem, '_end', func_eggex.MatchAccess(mem, func_eggex.E))

_SetGlobalFunc(mem, 'join', func_misc.Join())
_SetGlobalFunc(mem, 'maybe', func_misc.Maybe())
_SetGlobalFunc(mem, 'type', func_misc.Type())
Expand Down
29 changes: 22 additions & 7 deletions spec/ysh-regex.test.sh
Expand Up @@ -96,7 +96,7 @@ if [[ $x =~ ([[:digit:]]+)-([[:digit:]]+) ]] {

# THIS IS A NO-OP. The variable is SHADOWED by the special name.
# I think that's OK.
setvar BASH_REMATCH = %(reset)
setvar BASH_REMATCH = :| reset |

if (x ~ /<capture d+> '-' <capture d+>/) {
argv.py "${BASH_REMATCH[@]}"
Expand Down Expand Up @@ -195,6 +195,21 @@ start=0 end=3
start=-1 end=-1
## END

#### Str->search() method returns value.Match object

var s = 'hello spam5-eggs6-'

var m = s => search(/ <capture [a-z]+ > <capture d+> '-' /)
echo "g0 $[m => start(0)] $[m => end(0)] $[m => group(0)]"
echo "g1 $[m => start(1)] $[m => end(1)] $[m => group(1)]"
echo "g2 $[m => start(2)] $[m => end(2)] $[m => group(2)]"

## STDOUT:
g0 6 12 spam5-
g1 6 10 spam
g2 10 11 5
## END

#### Repeat {1,3} etc.
var pat = null

Expand Down Expand Up @@ -443,7 +458,7 @@ no
#### Matching escaped tab character
shopt -s ysh:all

var lines=%($'aa\tbb' $'cc\tdd')
var lines = :| $'aa\tbb' $'cc\tdd' |

var pat = / ('a' [\t] 'b') /
write pat=$pat
Expand Down Expand Up @@ -555,7 +570,7 @@ echo $pat
shopt -s ysh:all

# BUG: need C strings in array literal
var lines=%($'aa\tbb' $'cc\tdd')
var lines = :| $'aa\tbb' $'cc\tdd' |

var pat = / ('a' [\t] 'b') /
write pat=$pat
Expand Down Expand Up @@ -879,13 +894,13 @@ no way to have [^]
shopt -s ysh:all

# BUG: need C strings in array literal
var lines=%(
var lines = :|
'backslash \'
'rbracket ]'
'lbracket ['
"sq '"
'dq "'
)
'dq ""'
|

# Weird GNU quirk: ] has to come first!
# []abc] works. But [abc\]] does NOT work. Stupid rule!
Expand All @@ -899,7 +914,7 @@ pat=[]'"\\]
backslash \
rbracket ]
sq '
dq "
dq ""
## END

#### Matching literal hyphen in character classes
Expand Down

0 comments on commit 1124655

Please sign in to comment.