Skip to content

Commit 14ccdeb

Browse files
greezybacon authored and dpgeorge committed
extmod/modre: Add support for start- and endpos.
Pattern objects accept two additional optional parameters in their search() and match() methods that define the starting and ending positions of the subject within the string to be searched. This allows searching a sub-string without creating a slice. One caveat of using the start position rather than a slice is that the start anchor (`^`) remains anchored to the beginning of the full text.

Signed-off-by: Jared Hancock <jared@greezybacon.me>
1 parent 485dac7 commit 14ccdeb

File tree

3 files changed

+114
-3
lines changed

3 files changed

+114
-3
lines changed

docs/library/re.rst

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,15 +154,25 @@ Regex objects
154154
Compiled regular expression. Instances of this class are created using
155155
`re.compile()`.
156156

157-
.. method:: regex.match(string)
158-
regex.search(string)
157+
.. method:: regex.match(string, [pos, [endpos]])
158+
regex.search(string, [pos, [endpos]])
159159
regex.sub(replace, string, count=0, flags=0, /)
160160

161161
Similar to the module-level functions :meth:`match`, :meth:`search`
162162
and :meth:`sub`.
163163
Using methods is (much) more efficient if the same regex is applied to
164164
multiple strings.
165165

166+
The optional second parameter *pos* gives an index in the string where the
167+
search is to start; it defaults to ``0``. This is not completely equivalent
168+
to slicing the string; the ``'^'`` pattern character matches at the real
169+
beginning of the string and at positions just after a newline, but not
170+
necessarily at the index where the search is to start.
171+
172+
The optional parameter *endpos* limits how far the string will be searched;
173+
it will be as if the string is *endpos* characters long, so only the
174+
characters from *pos* to ``endpos - 1`` will be searched for a match.
175+
166176
.. method:: regex.split(string, max_split=-1, /)
167177

168178
Split a *string* using regex. If *max_split* is given, it specifies

extmod/modre.c

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,17 +196,40 @@ static void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t
196196

197197
// Note: this function can't be named re_exec because it may clash with system headers, eg on FreeBSD
198198
static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *args) {
199-
(void)n_args;
200199
mp_obj_re_t *self;
200+
bool was_compiled = false;
201201
if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) {
202202
self = MP_OBJ_TO_PTR(args[0]);
203+
was_compiled = true;
203204
} else {
204205
self = MP_OBJ_TO_PTR(mod_re_compile(1, args));
205206
}
206207
Subject subj;
207208
size_t len;
208209
subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len);
209210
subj.end = subj.begin + len;
211+
212+
if (was_compiled && n_args > 2) {
213+
// Arg #2 is starting-pos
214+
mp_int_t startpos = mp_obj_get_int(args[2]);
215+
if (startpos > (mp_int_t)len) {
216+
startpos = len;
217+
} else if (startpos < 0) {
218+
startpos = 0;
219+
}
220+
subj.begin += startpos;
221+
if (n_args > 3) {
222+
// Arg #3 is ending-pos
223+
mp_int_t endpos = mp_obj_get_int(args[3]);
224+
if (endpos > (mp_int_t)len) {
225+
endpos = len;
226+
} else if (endpos < startpos) {
227+
endpos = startpos;
228+
}
229+
subj.end = subj.begin_line + endpos;
230+
}
231+
}
232+
210233
int caps_num = (self->re.sub + 1) * 2;
211234
mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num);
212235
// cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char

tests/extmod/re_start_end_pos.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# test start and end pos specification
2+
3+
try:
4+
import re
5+
except ImportError:
6+
print("SKIP")
7+
raise SystemExit
8+
9+
10+
def print_groups(match):
11+
print("----")
12+
try:
13+
if match is not None:
14+
i = 0
15+
while True:
16+
print(match.group(i))
17+
i += 1
18+
except IndexError:
19+
pass
20+
21+
22+
p = re.compile(r"o")
23+
m = p.match("dog")
24+
print_groups(m)
25+
26+
m = p.match("dog", 1)
27+
print_groups(m)
28+
29+
m = p.match("dog", 2)
30+
print_groups(m)
31+
32+
# No match past end of input
33+
m = p.match("dog", 5)
34+
print_groups(m)
35+
36+
m = p.match("dog", 0, 1)
37+
print_groups(m)
38+
39+
# Caret only matches the actual beginning
40+
p = re.compile(r"^o")
41+
m = p.match("dog", 1)
42+
print_groups(m)
43+
44+
# End at beginning means searching empty string
45+
p = re.compile(r"o")
46+
m = p.match("dog", 1, 1)
47+
print_groups(m)
48+
49+
# End before the beginning doesn't match anything
50+
m = p.match("dog", 2, 1)
51+
print_groups(m)
52+
53+
# Negative starting values don't crash
54+
m = p.search("dog", -2)
55+
print_groups(m)
56+
57+
m = p.search("dog", -2, -5)
58+
print_groups(m)
59+
60+
# Search also works
61+
print("--search")
62+
63+
p = re.compile(r"o")
64+
m = p.search("dog")
65+
print_groups(m)
66+
67+
m = p.search("dog", 1)
68+
print_groups(m)
69+
70+
m = p.search("dog", 2)
71+
print_groups(m)
72+
73+
# Negative starting values don't crash
74+
m = p.search("dog", -2)
75+
print_groups(m)
76+
77+
m = p.search("dog", -2, -5)
78+
print_groups(m)

0 commit comments

Comments
 (0)