From 029d04909bccd79469f77e02006ebb6dda32a3e3 Mon Sep 17 00:00:00 2001 From: nitely Date: Fri, 25 Aug 2023 05:15:04 -0300 Subject: [PATCH 1/9] ORC speedups --- bench/bench.nim | 240 ++++++++++++++++++++-------------------- bench/bench2.nim | 23 ++++ src/regex/nfa.nim | 44 ++++---- src/regex/nfamatch2.nim | 54 +++++---- 4 files changed, 198 insertions(+), 163 deletions(-) create mode 100644 bench/bench2.nim diff --git a/bench/bench.nim b/bench/bench.nim index 09967d9..8d48ec1 100644 --- a/bench/bench.nim +++ b/bench/bench.nim @@ -29,159 +29,163 @@ benchRelative(regex_sol, m): discard regex.match(text, pattern4, m2) doNotOptimizeAway(m2) -benchRelative(regex_macro_sol, m): - var d: bool - for i in 0 ..< m: - regex.match text, regex.rex"\w*sol\w*": - d = true - doNotOptimizeAway(d) +when isMainModule: + runBenchmarks() -var dummyTextNums = """650-253-0001""" +when false: + benchRelative(regex_macro_sol, m): + var d: bool + for i in 0 ..< m: + regex.match text, regex.rex"\w*sol\w*": + d = true + doNotOptimizeAway(d) -var pattern_nums = re.re"^[0-9]+-[0-9]+-[0-9]+$" + var dummyTextNums = """650-253-0001""" -bench(re_nums, m): - var d: bool - for i in 0 ..< m: - d = re.match(dummyTextNums, pattern_nums) - doNotOptimizeAway(d) + var pattern_nums = re.re"^[0-9]+-[0-9]+-[0-9]+$" -const n_pattern_nums = regex.re2"[0-9]+-[0-9]+-[0-9]+" - -benchRelative(regex_nums, m): - var m2: regex.RegexMatch2 - for i in 0 ..< m: - discard regex.match(dummyTextNums, n_pattern_nums, m2) - doNotOptimizeAway(m2) + bench(re_nums, m): + var d: bool + for i in 0 ..< m: + d = re.match(dummyTextNums, pattern_nums) + doNotOptimizeAway(d) -benchRelative(regex_macro_nums, m): - var d: bool - for i in 0 ..< m: - regex.match text, regex.rex"[0-9]+-[0-9]+-[0-9]+": - d = true - doNotOptimizeAway(d) + const n_pattern_nums = regex.re2"[0-9]+-[0-9]+-[0-9]+" -var pattern_nums2 = re.re"^[0-9]+..*$" + benchRelative(regex_nums, m): + var m2: regex.RegexMatch2 + for i in 0 ..< m: + discard regex.match(dummyTextNums, n_pattern_nums, m2) + doNotOptimizeAway(m2) -bench(re_nums2, m): - var d: bool - for i in 0 ..< m: - d = re.match(dummyTextNums, pattern_nums2) - doNotOptimizeAway(d) + benchRelative(regex_macro_nums, m): + var d: bool + for i in 0 ..< m: + regex.match text, regex.rex"[0-9]+-[0-9]+-[0-9]+": + d = true + doNotOptimizeAway(d) -const n_pattern_nums2 = regex.re2"[0-9]+..*" + var pattern_nums2 = re.re"^[0-9]+..*$" -benchRelative(regex_nums2, m): - var m3: regex.RegexMatch2 - for i in 0 ..< m: - discard regex.match(dummyTextNums, n_pattern_nums2, m3) - doNotOptimizeAway(m3) + bench(re_nums2, m): + var d: bool + for i in 0 ..< m: + d = re.match(dummyTextNums, pattern_nums2) + doNotOptimizeAway(d) -benchRelative(regex_macro_nums2, m): - var d: bool - for i in 0 ..< m: - regex.match text, regex.rex"[0-9]+..*": - d = true - doNotOptimizeAway(d) + const n_pattern_nums2 = regex.re2"[0-9]+..*" -when false: # XXX remove - var lits_find_re = re.re"do|re|mi|fa|sol" + benchRelative(regex_nums2, m): + var m3: regex.RegexMatch2 + for i in 0 ..< m: + discard regex.match(dummyTextNums, n_pattern_nums2, m3) + doNotOptimizeAway(m3) - bench(re_lits_find, m): - var d: int + benchRelative(regex_macro_nums2, m): + var d: bool for i in 0 ..< m: - d = re.find(text, lits_find_re) + regex.match text, regex.rex"[0-9]+..*": + d = true doNotOptimizeAway(d) - const lits_find = regex.re2"do|re|mi|fa|sol" + when false: # XXX remove + var lits_find_re = re.re"do|re|mi|fa|sol" - benchRelative(regex_lits_find, m): - var m2: regex.RegexMatch2 - for i in 0 ..< m: - discard regex.find(text, lits_find, m2) - doNotOptimizeAway(m2) + bench(re_lits_find, m): + var d: int + for i in 0 ..< m: + d = re.find(text, lits_find_re) + doNotOptimizeAway(d) -const bench_text = staticRead("input-text.txt") + const lits_find = regex.re2"do|re|mi|fa|sol" -var email_find_all_re = re.re"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" + benchRelative(regex_lits_find, m): + var m2: regex.RegexMatch2 + for i in 0 ..< m: + discard regex.find(text, lits_find, m2) + doNotOptimizeAway(m2) -bench(re_email_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in re.findAll(bench_text, email_find_all_re): - d += 1 - doAssert d == 92 - doNotOptimizeAway(d) + const bench_text = staticRead("input-text.txt") -const email_find_all = regex.re2"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" + var email_find_all_re = re.re"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" -benchRelative(regex_email_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in regex.findAll(bench_text, email_find_all): - d += 1 - doAssert d == 92 - doNotOptimizeAway(d) + bench(re_email_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in re.findAll(bench_text, email_find_all_re): + d += 1 + doAssert d == 92 + doNotOptimizeAway(d) -var uri_find_all_re = re.re"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" + const email_find_all = regex.re2"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" -bench(re_uri_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in re.findAll(bench_text, uri_find_all_re): - d += 1 - doAssert d == 5301 - doNotOptimizeAway(d) + benchRelative(regex_email_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in regex.findAll(bench_text, email_find_all): + d += 1 + doAssert d == 92 + doNotOptimizeAway(d) -const uri_find_all = regex.re2"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" + var uri_find_all_re = re.re"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" -benchRelative(regex_uri_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in regex.findAll(bench_text, uri_find_all): - d += 1 - doAssert d == 5301 - doNotOptimizeAway(d) + bench(re_uri_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in re.findAll(bench_text, uri_find_all_re): + d += 1 + doAssert d == 5301 + doNotOptimizeAway(d) -var ip_find_all_re = re.re"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" + const uri_find_all = regex.re2"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" -bench(re_ip_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in re.findAll(bench_text, ip_find_all_re): - d += 1 - doAssert d == 5 - doNotOptimizeAway(d) + benchRelative(regex_uri_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in regex.findAll(bench_text, uri_find_all): + d += 1 + doAssert d == 5301 + doNotOptimizeAway(d) -const ip_find_all = regex.re2"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" + var ip_find_all_re = re.re"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" -benchRelative(regex_ip_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in regex.findAll(bench_text, ip_find_all): - d += 1 - doAssert d == 5 - doNotOptimizeAway(d) + bench(re_ip_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in re.findAll(bench_text, ip_find_all_re): + d += 1 + doAssert d == 5 + doNotOptimizeAway(d) -when true: - bench(runes, m): + const ip_find_all = regex.re2"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" + + benchRelative(regex_ip_find_all, m): var d = 0 for i in 0 ..< m: - for _ in bench_text.runes: + for _ in regex.findAll(bench_text, ip_find_all): d += 1 + doAssert d == 5 doNotOptimizeAway(d) -bench(dummy, m): - for i in 0 ..< m: - memoryClobber() + when true: + bench(runes, m): + var d = 0 + for i in 0 ..< m: + for _ in bench_text.runes: + d += 1 + doNotOptimizeAway(d) -when isMainModule: - runBenchmarks() + bench(dummy, m): + for i in 0 ..< m: + memoryClobber() + + when isMainModule: + runBenchmarks() -#[ -# Profiling: -# (but extract the bench to another module without nimbench) -# open the log with KCachegrind + #[ + # Profiling: + # (but extract the bench to another module without nimbench) + # open the log with KCachegrind -$ nim c --debugger:native --threads:off -d:danger -d:useMalloc -o:bin/bench2 bench/bench2.nim && valgrind --tool=callgrind -v ./bin/bench2 -]# + $ nim c --debugger:native --threads:off -d:danger -d:useMalloc -o:bin/bench2 bench/bench2.nim && valgrind --tool=callgrind -v ./bin/bench2 + ]# diff --git a/bench/bench2.nim b/bench/bench2.nim new file mode 100644 index 0000000..9f09372 --- /dev/null +++ b/bench/bench2.nim @@ -0,0 +1,23 @@ +import unicode +from regex import nil + +func genText(): string {.compileTime.} = + result = "" + for _ in 0 .. 100000: + result.add("a") + result.add("sol") + for _ in 0 .. 100000: + result.add("b") + #result.add("ฅ") +const text = genText() + +const pattern4 = regex.re2(r"\w*sol\w*") #, {regex.RegexFlag.reAscii}) + +proc runBenchmarks() = + var m2: regex.RegexMatch2 + for i in 0 ..< 500: + discard regex.match(text, pattern4, m2) + echo m2.captures + +when isMainModule: + runBenchmarks() \ No newline at end of file diff --git a/src/regex/nfa.nim b/src/regex/nfa.nim index cf16427..507c333 100644 --- a/src/regex/nfa.nim +++ b/src/regex/nfa.nim @@ -146,10 +146,10 @@ func eNfa*(exp: RpnExp): Enfa {.raises: [RegexError].} = result.s.add initSkipNode(states) type - Zclosure = seq[int16] - TeClosure = seq[(int16, Zclosure)] + Etransitions = seq[int16] # xxx transitions + TeClosure = seq[(int16, Etransitions)] -func isTransitionZ(n: Node): bool {.inline.} = +func isEpsilonTransition(n: Node): bool {.inline.} = result = case n.kind of groupKind: n.isCapturing @@ -163,24 +163,24 @@ func teClosure( eNfa: Enfa, state: int16, processing: var seq[int16], - zTransitions: Zclosure + eTransitions: Etransitions ) = - var zTransitionsCurr = zTransitions - if isTransitionZ eNfa.s[state]: - zTransitionsCurr.add state + var eTransitionsCurr = eTransitions + if isEpsilonTransition eNfa.s[state]: + eTransitionsCurr.add state if eNfa.s[state].kind in matchableKind + {reEOE}: - result.add (state, zTransitionsCurr) + result.add (state, eTransitionsCurr) return for i, s in pairs eNfa.s[state].next: # Enter loops only once. "a", re"(a*)*" -> ["a", ""] if eNfa.s[state].kind in repetitionKind: if s notin processing or i == int(eNfa.s[state].isGreedy): processing.add s - teClosure(result, eNfa, s, processing, zTransitionsCurr) + teClosure(result, eNfa, s, processing, eTransitionsCurr) discard processing.pop() # else skip loop else: - teClosure(result, eNfa, s, processing, zTransitionsCurr) + teClosure(result, eNfa, s, processing, eTransitionsCurr) func teClosure( result: var TeClosure, @@ -189,9 +189,9 @@ func teClosure( processing: var seq[int16] ) = doAssert processing.len == 0 - var zclosure: Zclosure + var eTransitions: Etransitions for s in eNfa.s[state].next: - teClosure(result, eNfa, s, processing, zclosure) + teClosure(result, eNfa, s, processing, eTransitions) when (NimMajor, NimMinor, NimPatch) < (1,4,0) and not declared(IndexDefect): # avoids a warning @@ -206,7 +206,6 @@ func eRemoval*(eNfa: Enfa): Nfa {.raises: [].} = #echo eNfa result.s = newSeq[Node](eNfa.s.len) result.s.setLen 0 - result.t.allZ.setLen eNfa.s.len var statesMap = newSeq[int16](eNfa.s.len) for i in 0 .. statesMap.len-1: statesMap[i] = -1 @@ -214,7 +213,6 @@ func eRemoval*(eNfa: Enfa): Nfa {.raises: [].} = result.s.add eNfa.s[start] statesMap[start] = 0'i16 var closure: TeClosure - var zc: seq[Node] var qw = initDeque[int16](2) qw.addFirst start var qu: set[int16] @@ -228,25 +226,21 @@ func eRemoval*(eNfa: Enfa): Nfa {.raises: [].} = doAssert false closure.setLen 0 teClosure(closure, eNfa, qa, processing) + doAssert statesMap[qa] > -1 result.s[statesMap[qa]].next.setLen 0 - for qb, zclosure in closure.items: + for qb, eTransitions in closure.items: + for eti in eTransitions: + if statesMap[eti] == -1: + result.s.add eNfa.s[eti] + statesMap[eti] = result.s.len.int16-1 + result.s[statesMap[qa]].next.add statesMap[eti] if statesMap[qb] == -1: result.s.add eNfa.s[qb] statesMap[qb] = result.s.len.int16-1 - doAssert statesMap[qb] > -1 - doAssert statesMap[qa] > -1 result.s[statesMap[qa]].next.add statesMap[qb] - result.t.allZ[statesMap[qa]].add -1'i16 - zc.setLen 0 - for z in zclosure: - zc.add eNfa.s[z] - if zc.len > 0: - result.t.z.add zc - result.t.allZ[statesMap[qa]][^1] = int16(result.t.z.len-1) if qb notin qu: qu.incl qb qw.addFirst qb - result.t.allZ.setLen result.s.len func reverse(eNfa: Enfa): Enfa = template state0: untyped = int16(eNfa.s.len-1) diff --git a/src/regex/nfamatch2.nim b/src/regex/nfamatch2.nim index f3e1bdc..31e23ba 100644 --- a/src/regex/nfamatch2.nim +++ b/src/regex/nfamatch2.nim @@ -67,11 +67,22 @@ template lookAroundTpl*: untyped {.dirty.} = false smL.removeLast() +func isEpsilonTransition(n: Node): bool {.inline.} = + result = case n.kind + of groupKind, assertionKind: + true + else: + false + +template s(nfa: openArray[Node]): untyped = + nfa + template nextStateTpl(bwMatch = false): untyped {.dirty.} = template bounds2: untyped = when bwMatch: i .. bounds.b else: bounds.a .. i-1 template captElm: untyped = - capts[captx, z.idx] + capts[captx, nfa.s[nt].idx] + template z: untyped = nfa.s[nt] smB.clear() for n, capt, bounds in items smA: if capt != -1: @@ -80,24 +91,29 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = if not smB.hasState n: smB.add (n, capt, bounds) break - for nti, nt in pairs nfa.s[n].next: - if smB.hasState nt: - continue - if not match(nfa.s[nt], c): - if not (anchored and nfa.s[nt].kind == reEoe): + matched = true + captx = capt + for nti, nt in pairs toOpenArray(nfa.s[n].next, 0, nfa.s[n].next.len-1): + if not isEpsilonTransition(nfa.s[n]): + if not matched: + matched = true + captx = capt continue - if nfa.t.allZ[n][nti] == -1'i16: - smB.add (nt, capt, bounds2) + if smB.hasState nt: + captx = capt + continue + if not match(nfa.s[nt], c): + if not (anchored and nfa.s[nt].kind == reEoe): + captx = capt + continue + smB.add (nt, captx, bounds2) + captx = capt continue - matched = true - captx = capt - for z in nfa.t.z[nfa.t.allZ[n][nti]]: - if not matched: - break - case z.kind + if not matched: + continue + case nfa.s[nt].kind of reGroupStart: - # XXX this can be avoided on 1st z loop iteration - # and also on 1st nti loop iteration + # XXX this can be avoided in some cases? captx = capts.diverge captx if mfReverseCapts notin flags or captElm.a == nonCapture.a: @@ -109,9 +125,9 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = captElm.b = i-1 of assertionKind - lookaroundKind: when bwMatch: - matched = match(z, c, cPrev.Rune) + matched = match(nfa.s[nt], c, cPrev.Rune) else: - matched = match(z, cPrev.Rune, c) + matched = match(nfa.s[nt], cPrev.Rune, c) of lookaroundKind: let freezed = capts.freeze() lookAroundTpl() @@ -121,8 +137,6 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = else: doAssert false discard - if matched: - smB.add (nt, captx, bounds2) swap smA, smB capts.recycle() From 8fac4837d3b20fd14fe41f53f40bb81044a4eb7c Mon Sep 17 00:00:00 2001 From: nitely Date: Wed, 13 Sep 2023 20:55:13 -0300 Subject: [PATCH 2/9] progress --- src/regex.nim | 5 +++- src/regex/nfafindall.nim | 48 ++++++++++++++++----------------- src/regex/nfafindall2.nim | 56 +++++++++++++++++++-------------------- src/regex/nfamatch.nim | 23 +++++++--------- src/regex/nfamatch2.nim | 41 +++++++++------------------- src/regex/types.nim | 7 +++++ 6 files changed, 86 insertions(+), 94 deletions(-) diff --git a/src/regex.nim b/src/regex.nim index 5beb1e1..d2aa408 100644 --- a/src/regex.nim +++ b/src/regex.nim @@ -301,7 +301,10 @@ import ./regex/nfamatch2 when not defined(noRegexOpt): import ./regex/litopt -const canUseMacro = (NimMajor, NimMinor) >= (1, 1) +when not defined(noRegexMacro): + const canUseMacro = (NimMajor, NimMinor) >= (1, 1) +else: + const canUseMacro = false when canUseMacro: import ./regex/nfamacro diff --git a/src/regex/nfafindall.nim b/src/regex/nfafindall.nim index 864d4d0..e748985 100644 --- a/src/regex/nfafindall.nim +++ b/src/regex/nfafindall.nim @@ -118,38 +118,37 @@ func submatch( template capt: untyped = ms.a[smi].ci template bounds: untyped = ms.a[smi].bounds template look: untyped = ms.look + template z: untyped = nfa[nt] + template nt: untyped = nfa[n].next[nti] smB.clear() var captx: int32 var matched = true var eoeFound = false var smi = 0 while smi < smA.len: - for nti, nt in nfa[n].next.pairs: - if smB.hasState nt: - continue - if nfa[nt].kind != reEoe and not match(nfa[nt], c.Rune): - continue + var nti = 0 + while nti <= nfa[n].next.len-1: matched = true captx = capt - if tns.allZ[n][nti] > -1: - for z in tns.z[tns.allZ[n][nti]]: - if not matched: - break - case z.kind - of groupKind: - capts.add CaptNode( - parent: captx, - bound: i, - idx: z.idx) - captx = (capts.len-1).int32 - of assertionKind - lookaroundKind: - matched = match(z, cPrev.Rune, c.Rune) - of lookaroundKind: - lookAroundTpl() - else: - assert false - discard - if matched: + while isEpsilonTransition(nfa[nt]) and matched: + case z.kind + of groupKind: + capts.add CaptNode( + parent: captx, + bound: i, + idx: z.idx) + captx = (capts.len-1).int32 + of assertionKind - lookaroundKind: + matched = match(z, cPrev.Rune, c.Rune) + of lookaroundKind: + lookAroundTpl() + else: + assert false + discard + inc nti + if matched and + not smB.hasState(nt) and + (nfa[nt].match(c.Rune) or nfa[nt].kind == reEoe): if nfa[nt].kind == reEoe: #debugEcho "eoe ", bounds, " ", ms.m ms.m.add (captx, bounds.a .. i-1) @@ -160,6 +159,7 @@ func submatch( smi = -1 break smB.add (nt, captx, bounds.a .. i-1) + inc nti inc smi swap smA, smB diff --git a/src/regex/nfafindall2.nim b/src/regex/nfafindall2.nim index 5a2a7cd..1e47623 100644 --- a/src/regex/nfafindall2.nim +++ b/src/regex/nfafindall2.nim @@ -155,6 +155,8 @@ func submatch( template capt: untyped = ms.a[smi].ci template bounds: untyped = ms.a[smi].bounds template look: untyped = ms.look + template z: untyped = nfa[nt] + template nt: untyped = nfa[n].next[nti] smB.clear() var captx: int32 var matched = true @@ -163,36 +165,33 @@ func submatch( while smi < smA.len: if capt != -1: capts.keepAlive capt - for nti, nt in nfa[n].next.pairs: - if smB.hasState nt: - continue - if nfa[nt].kind != reEoe and not match(nfa[nt], c.Rune): - continue + var nti = 0 + while nti <= nfa[n].next.len-1: matched = true captx = capt - if tns.allZ[n][nti] > -1: - for z in tns.z[tns.allZ[n][nti]]: - if not matched: - break - case z.kind - of reGroupStart: - captx = capts.diverge captx - capts[captx, z.idx].a = i - of reGroupEnd: - captx = capts.diverge captx - capts[captx, z.idx].b = i-1 - of assertionKind - lookaroundKind: - matched = match(z, cPrev.Rune, c.Rune) - of lookaroundKind: - let freezed = capts.freeze() - lookAroundTpl() - capts.unfreeze freezed - if captx != -1: - capts.keepAlive captx - else: - assert false - discard - if matched: + while isEpsilonTransition(nfa[nt]) and matched: + case z.kind + of reGroupStart: + captx = capts.diverge captx + capts[captx, z.idx].a = i + of reGroupEnd: + captx = capts.diverge captx + capts[captx, z.idx].b = i-1 + of assertionKind - lookaroundKind: + matched = match(z, cPrev.Rune, c.Rune) + of lookaroundKind: + let freezed = capts.freeze() + lookAroundTpl() + capts.unfreeze freezed + if captx != -1: + capts.keepAlive captx + else: + assert false + discard + inc nti + if matched and + not smB.hasState(nt) and + (nfa[nt].match(c.Rune) or nfa[nt].kind == reEoe): if nfa[nt].kind == reEoe: #debugEcho "eoe ", bounds, " ", ms.m ms.add (captx, bounds.a .. i-1) @@ -203,6 +202,7 @@ func submatch( smi = -1 break smB.add (nt, captx, bounds.a .. i-1) + inc nti inc smi swap smA, smB capts.recycle() diff --git a/src/regex/nfamatch.nim b/src/regex/nfamatch.nim index fc3db56..3c8fce0 100644 --- a/src/regex/nfamatch.nim +++ b/src/regex/nfamatch.nim @@ -70,26 +70,19 @@ template lookAroundTpl*: untyped {.dirty.} = template nextStateTpl(bwMatch = false): untyped {.dirty.} = template bounds2: untyped = when bwMatch: i .. bounds.b else: bounds.a .. i-1 + template z: untyped = nfa.s[nt] + template nt: untyped = nfa.s[n].next[nti] smB.clear() for n, capt, bounds in items smA: if anchored and nfa.s[n].kind == reEoe: if not smB.hasState n: smB.add (n, capt, bounds) break - for nti, nt in pairs nfa.s[n].next: - if smB.hasState nt: - continue - if not match(nfa.s[nt], c): - if not (anchored and nfa.s[nt].kind == reEoe): - continue - if nfa.t.allZ[n][nti] == -1'i16: - smB.add (nt, capt, bounds2) - continue + var nti = 0 + while nti <= nfa.s[n].next.len-1: matched = true captx = capt - for z in nfa.t.z[nfa.t.allZ[n][nti]]: - if not matched: - break + while isEpsilonTransition(nfa.s[nt]) and matched: case z.kind of groupKind: capts.add CaptNode( @@ -107,8 +100,12 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = else: doAssert false discard - if matched: + inc nti + if matched and + not smB.hasState(nt) and + (nfa.s[nt].match(c) or (anchored and nfa.s[nt].kind == reEoe)): smB.add (nt, captx, bounds2) + inc nti swap smA, smB func matchImpl( diff --git a/src/regex/nfamatch2.nim b/src/regex/nfamatch2.nim index 31e23ba..6606ceb 100644 --- a/src/regex/nfamatch2.nim +++ b/src/regex/nfamatch2.nim @@ -67,13 +67,6 @@ template lookAroundTpl*: untyped {.dirty.} = false smL.removeLast() -func isEpsilonTransition(n: Node): bool {.inline.} = - result = case n.kind - of groupKind, assertionKind: - true - else: - false - template s(nfa: openArray[Node]): untyped = nfa @@ -83,6 +76,7 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = template captElm: untyped = capts[captx, nfa.s[nt].idx] template z: untyped = nfa.s[nt] + template nt: untyped = nfa.s[n].next[nti] smB.clear() for n, capt, bounds in items smA: if capt != -1: @@ -91,27 +85,12 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = if not smB.hasState n: smB.add (n, capt, bounds) break - matched = true - captx = capt - for nti, nt in pairs toOpenArray(nfa.s[n].next, 0, nfa.s[n].next.len-1): - if not isEpsilonTransition(nfa.s[n]): - if not matched: - matched = true - captx = capt - continue - if smB.hasState nt: - captx = capt - continue - if not match(nfa.s[nt], c): - if not (anchored and nfa.s[nt].kind == reEoe): - captx = capt - continue - smB.add (nt, captx, bounds2) - captx = capt - continue - if not matched: - continue - case nfa.s[nt].kind + var nti = 0 + while nti <= nfa.s[n].next.len-1: + matched = true + captx = capt + while isEpsilonTransition(nfa.s[nt]) and matched: + case nfa.s[nt].kind of reGroupStart: # XXX this can be avoided in some cases? captx = capts.diverge captx @@ -137,6 +116,12 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = else: doAssert false discard + inc nti + if matched and + not smB.hasState(nt) and + (nfa.s[nt].match(c) or (anchored and nfa.s[nt].kind == reEoe)): + smB.add (nt, captx, bounds2) + inc nti swap smA, smB capts.recycle() diff --git a/src/regex/types.nim b/src/regex/types.nim index 569a1dd..e3b329a 100644 --- a/src/regex/types.nim +++ b/src/regex/types.nim @@ -278,6 +278,13 @@ const reGroupEnd} groupStartKind* = {reGroupStart} + lookaroundKind +func isEpsilonTransition*(n: Node): bool {.inline.} = + result = case n.kind + of groupKind, assertionKind: + true + else: + false + func `$`*(n: Node): string = ## return the string representation ## of a `Node`. The string is always From 0e598756a4696b2773fb99360b3408ec11ccd2b5 Mon Sep 17 00:00:00 2001 From: nitely Date: Thu, 14 Sep 2023 22:56:06 -0300 Subject: [PATCH 3/9] progress --- src/regex/nfamacro.nim | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim index 9f032df..a4535f8 100644 --- a/src/regex/nfamacro.nim +++ b/src/regex/nfamacro.nim @@ -4,6 +4,7 @@ import std/macros import std/unicode import std/tables import std/sets +import std/algorithm import pkg/unicodedb/properties import pkg/unicodedb/types as utypes @@ -268,6 +269,13 @@ func genLookaroundMatch( `lookaroundStmt` removeLast `smL` +func getEpsilonTransitions(nfa: Nfa, n: Node, nti: int): seq[int] = + for i in countdown(nti-1, 0): + if not isEpsilonTransition(nfa.s[n.next[i]]): + break + result.add n.next[i] + result.reverse() + func genMatchedBody( smB, ntLit, capt, bounds, matched, captx, capts, charIdx, cPrev, c, text: NimNode, @@ -276,19 +284,21 @@ func genMatchedBody( look: Lookaround, flags: set[MatchFlag] ): NimNode = - template t: untyped = nfa.t + template n: untyped = nfa.s[i] + template z: untyped = nfa.s[eti] let bounds2 = if mfBwMatch in flags: quote do: `charIdx` .. `bounds`.b else: quote do: `bounds`.a .. `charIdx`-1 - if t.allZ[i][nti] == -1'i16: + let eTransitions = getEpsilonTransitions(nfa, n, nti) + if eTransitions.len == 0: return quote do: add(`smB`, (`ntLit`, `capt`, `bounds2`)) var matchedBody: seq[NimNode] matchedBody.add quote do: `matched` = true `captx` = `capt` - for z in t.z[t.allZ[i][nti]]: + for eti in eTransitions: case z.kind of groupKind: let zIdx = newLit z.idx @@ -347,10 +357,14 @@ func genNextState( for i in 0 .. s.len-1: if s[i].kind == reEoe: continue + if isEpsilonTransition(s[i]): + continue var branchBodyN: seq[NimNode] for nti, nt in s[i].next.pairs: if eoeOnly and s[nt].kind != reEoe: continue + if isEpsilonTransition(s[nt]): + continue let matchCond = case s[nt].kind of reEoe: quote do: `c` == -1'i32 From e3c5893207a0f72569ddbada1b6dbce810b208af Mon Sep 17 00:00:00 2001 From: nitely Date: Thu, 14 Sep 2023 23:28:43 -0300 Subject: [PATCH 4/9] progress --- src/regex.nim | 8 ++++---- src/regex/dotgraph.nim | 27 +++++++++++++++++++-------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/regex.nim b/src/regex.nim index d2aa408..0c7dba4 100644 --- a/src/regex.nim +++ b/src/regex.nim @@ -1371,10 +1371,10 @@ when isMainModule: doAssert graph(Regex(re2"^a+$")) == """digraph graphname { 0 [label="q0";color=blue]; - 1 [label="q1";color=black]; - 2 [label="q2";color=blue]; - 0 -> 1 [label="a, {^}, i=0"]; - 1 -> 1 [label="a, i=0"];1 -> 2 [label="{eoe}, {$}, i=1"]; + 2 [label="q1";color=black]; + 4 [label="q2";color=blue]; + 0 -> 2 [label="a, {^}, i=0"]; + 2 -> 2 [label="a, i=0"];2 -> 4 [label="{eoe}, {$}, i=1"]; } """ diff --git a/src/regex/dotgraph.nim b/src/regex/dotgraph.nim index a1ddd61..3548f4c 100644 --- a/src/regex/dotgraph.nim +++ b/src/regex/dotgraph.nim @@ -14,23 +14,34 @@ func color(n: Node): string = func graph*(nfa: Nfa): string = result = "digraph graphname {\n" let tab = " " + var qi = 0 for i, n in pairs nfa.s: + if isEpsilonTransition(n): + continue result.add tab - result.add($i & " [label=\"q" & $i & "\";color=" & n.color & "];") + result.add($i & " [label=\"q" & $qi & "\";color=" & n.color & "];") result.add '\n' + inc qi for i, n in pairs nfa.s: if n.next.len == 0: continue + if isEpsilonTransition(n): + continue result.add tab - for i2, n2 in pairs n.next: - var t = "" - if nfa.t.allZ[i][i2] > -1: - for i3, z in pairs nfa.t.z[nfa.t.allZ[i][i2]]: - if i3 > 0: t &= ", " - t &= $z + var t = "" + var ii = 0 + for n2 in n.next: + if isEpsilonTransition(nfa.s[n2]): + if t.len > 0: + t &= ", " + t &= $nfa.s[n2] + continue + if t.len > 0: t = ", {" & t & "}" - let label = ($nfa.s[n2] & t & ", i=" & $i2).replace(r"\", r"\\") + let label = ($nfa.s[n2] & t & ", i=" & $ii).replace(r"\", r"\\") result.add($i & " -> " & $n2 & " [label=\"" & label & "\"];") + t = "" + inc ii result.add '\n' result.add "}\n" From 3c6db18071379560a00508d1aeebe8d395cabb1c Mon Sep 17 00:00:00 2001 From: nitely Date: Thu, 14 Sep 2023 23:45:32 -0300 Subject: [PATCH 5/9] progress --- src/regex.nim | 2 ++ src/regex/litopt.nim | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/regex.nim b/src/regex.nim index 0c7dba4..42e6459 100644 --- a/src/regex.nim +++ b/src/regex.nim @@ -1146,6 +1146,8 @@ proc toString( result = "[" result.add($n) for nn in n.next: + if isEpsilonTransition(pattern.nfa.s[nn]): + continue result.add(", ") result.add(pattern.toString(nn, visited)) result.add("]") diff --git a/src/regex/litopt.nim b/src/regex/litopt.nim index 7285ca0..f378da6 100644 --- a/src/regex/litopt.nim +++ b/src/regex/litopt.nim @@ -287,6 +287,8 @@ when isMainModule: result = "[" result.add $n.cp for nn in n.next: + if isEpsilonTransition(nfa.s[nn]): + continue result.add ", " result.add toString(nfa, nn, visited) result.add "]" From ba29d9079e66dbd747577ae40ece0ece565c9417 Mon Sep 17 00:00:00 2001 From: nitely Date: Fri, 15 Sep 2023 17:41:07 -0300 Subject: [PATCH 6/9] progress --- src/regex/nfafindall.nim | 5 ++++- src/regex/nfafindall2.nim | 5 ++++- src/regex/nfamatch.nim | 3 +++ src/regex/nfamatch2.nim | 3 +++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/regex/nfafindall.nim b/src/regex/nfafindall.nim index e748985..db953bc 100644 --- a/src/regex/nfafindall.nim +++ b/src/regex/nfafindall.nim @@ -143,9 +143,12 @@ func submatch( of lookaroundKind: lookAroundTpl() else: - assert false + doAssert false discard inc nti + while isEpsilonTransition(nfa[nt]): + # skip unmatched epsilons + inc nti if matched and not smB.hasState(nt) and (nfa[nt].match(c.Rune) or nfa[nt].kind == reEoe): diff --git a/src/regex/nfafindall2.nim b/src/regex/nfafindall2.nim index 1e47623..0868ce3 100644 --- a/src/regex/nfafindall2.nim +++ b/src/regex/nfafindall2.nim @@ -186,9 +186,12 @@ func submatch( if captx != -1: capts.keepAlive captx else: - assert false + doAssert false discard inc nti + while isEpsilonTransition(nfa[nt]): + # skip unmatched epsilons + inc nti if matched and not smB.hasState(nt) and (nfa[nt].match(c.Rune) or nfa[nt].kind == reEoe): diff --git a/src/regex/nfamatch.nim b/src/regex/nfamatch.nim index 3c8fce0..470bf4a 100644 --- a/src/regex/nfamatch.nim +++ b/src/regex/nfamatch.nim @@ -101,6 +101,9 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = doAssert false discard inc nti + while isEpsilonTransition(nfa.s[nt]): + # skip unmatched epsilons + inc nti if matched and not smB.hasState(nt) and (nfa.s[nt].match(c) or (anchored and nfa.s[nt].kind == reEoe)): diff --git a/src/regex/nfamatch2.nim b/src/regex/nfamatch2.nim index 6606ceb..3c15a8b 100644 --- a/src/regex/nfamatch2.nim +++ b/src/regex/nfamatch2.nim @@ -117,6 +117,9 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = doAssert false discard inc nti + while isEpsilonTransition(nfa.s[nt]): + # skip unmatched epsilons + inc nti if matched and not smB.hasState(nt) and (nfa.s[nt].match(c) or (anchored and nfa.s[nt].kind == reEoe)): From 6fa5e1fd66fd5a01a601a42ff2def08a53b50a07 Mon Sep 17 00:00:00 2001 From: nitely Date: Fri, 15 Sep 2023 18:05:11 -0300 Subject: [PATCH 7/9] cleanup --- src/regex.nim | 5 +-- src/regex/nfa.nim | 4 +-- src/regex/nfafindall.nim | 41 +++++++++++------------ src/regex/nfafindall2.nim | 49 +++++++++++++-------------- src/regex/nfamatch.nim | 48 +++++++++++++-------------- src/regex/nfamatch2.nim | 69 ++++++++++++++++++--------------------- src/regex/types.nim | 6 ---- 7 files changed, 100 insertions(+), 122 deletions(-) diff --git a/src/regex.nim b/src/regex.nim index 42e6459..b958cfc 100644 --- a/src/regex.nim +++ b/src/regex.nim @@ -301,10 +301,7 @@ import ./regex/nfamatch2 when not defined(noRegexOpt): import ./regex/litopt -when not defined(noRegexMacro): - const canUseMacro = (NimMajor, NimMinor) >= (1, 1) -else: - const canUseMacro = false +const canUseMacro = (NimMajor, NimMinor) >= (1, 1) when canUseMacro: import ./regex/nfamacro diff --git a/src/regex/nfa.nim b/src/regex/nfa.nim index 507c333..f51afb4 100644 --- a/src/regex/nfa.nim +++ b/src/regex/nfa.nim @@ -149,7 +149,7 @@ type Etransitions = seq[int16] # xxx transitions TeClosure = seq[(int16, Etransitions)] -func isEpsilonTransition(n: Node): bool {.inline.} = +func isEpsilonTransition2(n: Node): bool {.inline.} = result = case n.kind of groupKind: n.isCapturing @@ -166,7 +166,7 @@ func teClosure( eTransitions: Etransitions ) = var eTransitionsCurr = eTransitions - if isEpsilonTransition eNfa.s[state]: + if isEpsilonTransition2 eNfa.s[state]: eTransitionsCurr.add state if eNfa.s[state].kind in matchableKind + {reEOE}: result.add (state, eTransitionsCurr) diff --git a/src/regex/nfafindall.nim b/src/regex/nfafindall.nim index db953bc..200efad 100644 --- a/src/regex/nfafindall.nim +++ b/src/regex/nfafindall.nim @@ -109,7 +109,6 @@ func submatch( i: int, cPrev, c: int32 ) {.inline.} = - template tns: untyped = regex.nfa.t template nfa: untyped = regex.nfa.s template smA: untyped = ms.a template smB: untyped = ms.b @@ -118,8 +117,8 @@ func submatch( template capt: untyped = ms.a[smi].ci template bounds: untyped = ms.a[smi].bounds template look: untyped = ms.look - template z: untyped = nfa[nt] template nt: untyped = nfa[n].next[nti] + template ntn: untyped = nfa[nt] smB.clear() var captx: int32 var matched = true @@ -130,29 +129,27 @@ func submatch( while nti <= nfa[n].next.len-1: matched = true captx = capt - while isEpsilonTransition(nfa[nt]) and matched: - case z.kind - of groupKind: - capts.add CaptNode( - parent: captx, - bound: i, - idx: z.idx) - captx = (capts.len-1).int32 - of assertionKind - lookaroundKind: - matched = match(z, cPrev.Rune, c.Rune) - of lookaroundKind: - lookAroundTpl() - else: - doAssert false - discard - inc nti - while isEpsilonTransition(nfa[nt]): - # skip unmatched epsilons + while isEpsilonTransition(ntn): + if matched: + case ntn.kind + of groupKind: + capts.add CaptNode( + parent: captx, + bound: i, + idx: ntn.idx) + captx = (capts.len-1).int32 + of assertionKind - lookaroundKind: + matched = match(ntn, cPrev.Rune, c.Rune) + of lookaroundKind: + lookAroundTpl() + else: + doAssert false + discard inc nti if matched and not smB.hasState(nt) and - (nfa[nt].match(c.Rune) or nfa[nt].kind == reEoe): - if nfa[nt].kind == reEoe: + (ntn.match(c.Rune) or ntn.kind == reEoe): + if ntn.kind == reEoe: #debugEcho "eoe ", bounds, " ", ms.m ms.m.add (captx, bounds.a .. i-1) smA.clear() diff --git a/src/regex/nfafindall2.nim b/src/regex/nfafindall2.nim index 0868ce3..fdd3500 100644 --- a/src/regex/nfafindall2.nim +++ b/src/regex/nfafindall2.nim @@ -146,7 +146,6 @@ func submatch( i: int, cPrev, c: int32 ) {.inline.} = - template tns: untyped = regex.nfa.t template nfa: untyped = regex.nfa.s template smA: untyped = ms.a template smB: untyped = ms.b @@ -155,8 +154,8 @@ func submatch( template capt: untyped = ms.a[smi].ci template bounds: untyped = ms.a[smi].bounds template look: untyped = ms.look - template z: untyped = nfa[nt] template nt: untyped = nfa[n].next[nti] + template ntn: untyped = nfa[nt] smB.clear() var captx: int32 var matched = true @@ -169,33 +168,31 @@ func submatch( while nti <= nfa[n].next.len-1: matched = true captx = capt - while isEpsilonTransition(nfa[nt]) and matched: - case z.kind - of reGroupStart: - captx = capts.diverge captx - capts[captx, z.idx].a = i - of reGroupEnd: - captx = capts.diverge captx - capts[captx, z.idx].b = i-1 - of assertionKind - lookaroundKind: - matched = match(z, cPrev.Rune, c.Rune) - of lookaroundKind: - let freezed = capts.freeze() - lookAroundTpl() - capts.unfreeze freezed - if captx != -1: - capts.keepAlive captx - else: - doAssert false - discard - inc nti - while isEpsilonTransition(nfa[nt]): - # skip unmatched epsilons + while isEpsilonTransition(ntn): + if matched: + case ntn.kind + of reGroupStart: + captx = capts.diverge captx + capts[captx, ntn.idx].a = i + of reGroupEnd: + captx = capts.diverge captx + capts[captx, ntn.idx].b = i-1 + of assertionKind - lookaroundKind: + matched = match(ntn, cPrev.Rune, c.Rune) + of lookaroundKind: + let freezed = capts.freeze() + lookAroundTpl() + capts.unfreeze freezed + if captx != -1: + capts.keepAlive captx + else: + doAssert false + discard inc nti if matched and not smB.hasState(nt) and - (nfa[nt].match(c.Rune) or nfa[nt].kind == reEoe): - if nfa[nt].kind == reEoe: + (ntn.match(c.Rune) or ntn.kind == reEoe): + if ntn.kind == reEoe: #debugEcho "eoe ", bounds, " ", ms.m ms.add (captx, bounds.a .. i-1) smA.clear() diff --git a/src/regex/nfamatch.nim b/src/regex/nfamatch.nim index 470bf4a..863d74a 100644 --- a/src/regex/nfamatch.nim +++ b/src/regex/nfamatch.nim @@ -38,14 +38,14 @@ template lookAroundTpl*: untyped {.dirty.} = template smL: untyped = look.smL template smLa: untyped = smL.lastA template smLb: untyped = smL.lastB - template zNfa: untyped = z.subExp.nfa - let flags2 = if z.subExp.reverseCapts: + template zNfa: untyped = ntn.subExp.nfa + let flags2 = if ntn.subExp.reverseCapts: {mfAnchored, mfReverseCapts} else: {mfAnchored} smL.grow() smL.last.setLen zNfa.s.len - matched = case z.kind + matched = case ntn.kind of reLookahead: look.ahead( smLa, smLb, capts, captx, @@ -70,8 +70,8 @@ template lookAroundTpl*: untyped {.dirty.} = template nextStateTpl(bwMatch = false): untyped {.dirty.} = template bounds2: untyped = when bwMatch: i .. bounds.b else: bounds.a .. i-1 - template z: untyped = nfa.s[nt] template nt: untyped = nfa.s[n].next[nti] + template ntn: untyped = nfa.s[nt] smB.clear() for n, capt, bounds in items smA: if anchored and nfa.s[n].kind == reEoe: @@ -82,31 +82,29 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = while nti <= nfa.s[n].next.len-1: matched = true captx = capt - while isEpsilonTransition(nfa.s[nt]) and matched: - case z.kind - of groupKind: - capts.add CaptNode( - parent: captx, - bound: i, - idx: z.idx) - captx = (capts.len-1).int32 - of assertionKind - lookaroundKind: - when bwMatch: - matched = match(z, c, cPrev.Rune) + while isEpsilonTransition(ntn): + if matched: + case ntn.kind + of groupKind: + capts.add CaptNode( + parent: captx, + bound: i, + idx: ntn.idx) + captx = (capts.len-1).int32 + of assertionKind - lookaroundKind: + when bwMatch: + matched = match(ntn, c, cPrev.Rune) + else: + matched = match(ntn, cPrev.Rune, c) + of lookaroundKind: + lookAroundTpl() else: - matched = match(z, cPrev.Rune, c) - of lookaroundKind: - lookAroundTpl() - else: - doAssert false - discard - inc nti - while isEpsilonTransition(nfa.s[nt]): - # skip unmatched epsilons + doAssert false + discard inc nti if matched and not smB.hasState(nt) and - (nfa.s[nt].match(c) or (anchored and nfa.s[nt].kind == reEoe)): + (ntn.match(c) or (anchored and ntn.kind == reEoe)): smB.add (nt, captx, bounds2) inc nti swap smA, smB diff --git a/src/regex/nfamatch2.nim b/src/regex/nfamatch2.nim index 3c15a8b..91c35e5 100644 --- a/src/regex/nfamatch2.nim +++ b/src/regex/nfamatch2.nim @@ -38,14 +38,14 @@ template lookAroundTpl*: untyped {.dirty.} = template smL: untyped = look.smL template smLa: untyped = smL.lastA template smLb: untyped = smL.lastB - template zNfa: untyped = z.subExp.nfa - let flags2 = if z.subExp.reverseCapts: + template zNfa: untyped = ntn.subExp.nfa + let flags2 = if ntn.subExp.reverseCapts: {mfAnchored, mfReverseCapts} else: {mfAnchored} smL.grow() smL.last.setLen zNfa.s.len - matched = case z.kind + matched = case ntn.kind of reLookahead: look.ahead( smLa, smLb, capts, captx, @@ -67,16 +67,13 @@ template lookAroundTpl*: untyped {.dirty.} = false smL.removeLast() -template s(nfa: openArray[Node]): untyped = - nfa - template nextStateTpl(bwMatch = false): untyped {.dirty.} = template bounds2: untyped = when bwMatch: i .. bounds.b else: bounds.a .. i-1 template captElm: untyped = capts[captx, nfa.s[nt].idx] - template z: untyped = nfa.s[nt] template nt: untyped = nfa.s[n].next[nti] + template ntn: untyped = nfa.s[nt] smB.clear() for n, capt, bounds in items smA: if capt != -1: @@ -89,40 +86,38 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = while nti <= nfa.s[n].next.len-1: matched = true captx = capt - while isEpsilonTransition(nfa.s[nt]) and matched: - case nfa.s[nt].kind - of reGroupStart: - # XXX this can be avoided in some cases? - captx = capts.diverge captx - if mfReverseCapts notin flags or - captElm.a == nonCapture.a: - captElm.a = i - of reGroupEnd: - captx = capts.diverge captx - if mfReverseCapts notin flags or - captElm.b == nonCapture.b: - captElm.b = i-1 - of assertionKind - lookaroundKind: - when bwMatch: - matched = match(nfa.s[nt], c, cPrev.Rune) + while isEpsilonTransition(ntn): + if matched: + case ntn.kind + of reGroupStart: + # XXX this can be avoided in some cases? + captx = capts.diverge captx + if mfReverseCapts notin flags or + captElm.a == nonCapture.a: + captElm.a = i + of reGroupEnd: + captx = capts.diverge captx + if mfReverseCapts notin flags or + captElm.b == nonCapture.b: + captElm.b = i-1 + of assertionKind - lookaroundKind: + when bwMatch: + matched = match(ntn, c, cPrev.Rune) + else: + matched = match(ntn, cPrev.Rune, c) + of lookaroundKind: + let freezed = capts.freeze() + lookAroundTpl() + capts.unfreeze freezed + if captx != -1: + capts.keepAlive captx else: - matched = match(nfa.s[nt], cPrev.Rune, c) - of lookaroundKind: - let freezed = capts.freeze() - lookAroundTpl() - capts.unfreeze freezed - if captx != -1: - capts.keepAlive captx - else: - doAssert false - discard - inc nti - while isEpsilonTransition(nfa.s[nt]): - # skip unmatched epsilons + doAssert false + discard inc nti if matched and not smB.hasState(nt) and - (nfa.s[nt].match(c) or (anchored and nfa.s[nt].kind == reEoe)): + (ntn.match(c) or (anchored and ntn.kind == reEoe)): smB.add (nt, captx, bounds2) inc nti swap smA, smB diff --git a/src/regex/types.nim b/src/regex/types.nim index e3b329a..c275fff 100644 --- a/src/regex/types.nim +++ b/src/regex/types.nim @@ -28,14 +28,8 @@ type # nfatype.nim Enfa* = object s*: seq[Node] - TransitionsAll* = seq[seq[int16]] - ZclosureStates* = seq[seq[Node]] - Transitions* = object - allZ*: TransitionsAll - z*: ZclosureStates Nfa* = object s*: seq[Node] - t*: Transitions # nodetype.nim Flag* = enum From 86bf5381efaac70d35f6ccf13e1975a3be1c1a77 Mon Sep 17 00:00:00 2001 From: nitely Date: Fri, 15 Sep 2023 18:13:56 -0300 Subject: [PATCH 8/9] cleanup --- bench/bench.nim | 240 +++++++++++++++++++++++------------------------ bench/bench2.nim | 23 ----- 2 files changed, 118 insertions(+), 145 deletions(-) delete mode 100644 bench/bench2.nim diff --git a/bench/bench.nim b/bench/bench.nim index 8d48ec1..09967d9 100644 --- a/bench/bench.nim +++ b/bench/bench.nim @@ -29,163 +29,159 @@ benchRelative(regex_sol, m): discard regex.match(text, pattern4, m2) doNotOptimizeAway(m2) -when isMainModule: - runBenchmarks() +benchRelative(regex_macro_sol, m): + var d: bool + for i in 0 ..< m: + regex.match text, regex.rex"\w*sol\w*": + d = true + doNotOptimizeAway(d) -when false: - benchRelative(regex_macro_sol, m): - var d: bool - for i in 0 ..< m: - regex.match text, regex.rex"\w*sol\w*": - d = true - doNotOptimizeAway(d) +var dummyTextNums = """650-253-0001""" - var dummyTextNums = """650-253-0001""" +var pattern_nums = re.re"^[0-9]+-[0-9]+-[0-9]+$" - var pattern_nums = re.re"^[0-9]+-[0-9]+-[0-9]+$" +bench(re_nums, m): + var d: bool + for i in 0 ..< m: + d = re.match(dummyTextNums, pattern_nums) + doNotOptimizeAway(d) - bench(re_nums, m): - var d: bool - for i in 0 ..< m: - d = re.match(dummyTextNums, pattern_nums) - doNotOptimizeAway(d) +const n_pattern_nums = regex.re2"[0-9]+-[0-9]+-[0-9]+" - const n_pattern_nums = regex.re2"[0-9]+-[0-9]+-[0-9]+" +benchRelative(regex_nums, m): + var m2: regex.RegexMatch2 + for i in 0 ..< m: + discard regex.match(dummyTextNums, n_pattern_nums, m2) + doNotOptimizeAway(m2) - benchRelative(regex_nums, m): - var m2: regex.RegexMatch2 - for i in 0 ..< m: - discard regex.match(dummyTextNums, n_pattern_nums, m2) - doNotOptimizeAway(m2) +benchRelative(regex_macro_nums, m): + var d: bool + for i in 0 ..< m: + regex.match text, regex.rex"[0-9]+-[0-9]+-[0-9]+": + d = true + doNotOptimizeAway(d) - benchRelative(regex_macro_nums, m): - var d: bool - for i in 0 ..< m: - regex.match text, regex.rex"[0-9]+-[0-9]+-[0-9]+": - d = true - doNotOptimizeAway(d) +var pattern_nums2 = re.re"^[0-9]+..*$" - var pattern_nums2 = re.re"^[0-9]+..*$" +bench(re_nums2, m): + var d: bool + for i in 0 ..< m: + d = re.match(dummyTextNums, pattern_nums2) + doNotOptimizeAway(d) - bench(re_nums2, m): - var d: bool - for i in 0 ..< m: - d = re.match(dummyTextNums, pattern_nums2) - doNotOptimizeAway(d) +const n_pattern_nums2 = regex.re2"[0-9]+..*" - const n_pattern_nums2 = regex.re2"[0-9]+..*" +benchRelative(regex_nums2, m): + var m3: regex.RegexMatch2 + for i in 0 ..< m: + discard regex.match(dummyTextNums, n_pattern_nums2, m3) + doNotOptimizeAway(m3) - benchRelative(regex_nums2, m): - var m3: regex.RegexMatch2 - for i in 0 ..< m: - discard regex.match(dummyTextNums, n_pattern_nums2, m3) - doNotOptimizeAway(m3) +benchRelative(regex_macro_nums2, m): + var d: bool + for i in 0 ..< m: + regex.match text, regex.rex"[0-9]+..*": + d = true + doNotOptimizeAway(d) + +when false: # XXX remove + var lits_find_re = re.re"do|re|mi|fa|sol" - benchRelative(regex_macro_nums2, m): - var d: bool + bench(re_lits_find, m): + var d: int for i in 0 ..< m: - regex.match text, regex.rex"[0-9]+..*": - d = true + d = re.find(text, lits_find_re) doNotOptimizeAway(d) - when false: # XXX remove - var lits_find_re = re.re"do|re|mi|fa|sol" + const lits_find = regex.re2"do|re|mi|fa|sol" - bench(re_lits_find, m): - var d: int - for i in 0 ..< m: - d = re.find(text, lits_find_re) - doNotOptimizeAway(d) - - const lits_find = regex.re2"do|re|mi|fa|sol" + benchRelative(regex_lits_find, m): + var m2: regex.RegexMatch2 + for i in 0 ..< m: + discard regex.find(text, lits_find, m2) + doNotOptimizeAway(m2) - benchRelative(regex_lits_find, m): - var m2: regex.RegexMatch2 - for i in 0 ..< m: - discard regex.find(text, lits_find, m2) - doNotOptimizeAway(m2) +const bench_text = staticRead("input-text.txt") - const bench_text = staticRead("input-text.txt") +var email_find_all_re = re.re"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" - var email_find_all_re = re.re"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" +bench(re_email_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in re.findAll(bench_text, email_find_all_re): + d += 1 + doAssert d == 92 + doNotOptimizeAway(d) - bench(re_email_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in re.findAll(bench_text, email_find_all_re): - d += 1 - doAssert d == 92 - doNotOptimizeAway(d) +const email_find_all = regex.re2"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" - const email_find_all = regex.re2"[\w\.+-]+@[\w\.-]+\.[\w\.-]+" +benchRelative(regex_email_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in regex.findAll(bench_text, email_find_all): + d += 1 + doAssert d == 92 + doNotOptimizeAway(d) - benchRelative(regex_email_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in regex.findAll(bench_text, email_find_all): - d += 1 - doAssert d == 92 - doNotOptimizeAway(d) +var uri_find_all_re = re.re"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" - var uri_find_all_re = re.re"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" +bench(re_uri_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in re.findAll(bench_text, uri_find_all_re): + d += 1 + doAssert d == 5301 + doNotOptimizeAway(d) - bench(re_uri_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in re.findAll(bench_text, uri_find_all_re): - d += 1 - doAssert d == 5301 - doNotOptimizeAway(d) +const uri_find_all = regex.re2"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" - const uri_find_all = regex.re2"[\w]+://[^/\s?#]+[^\s?#]+(?:\?[^\s#]*)?(?:#[^\s]*)?" +benchRelative(regex_uri_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in regex.findAll(bench_text, uri_find_all): + d += 1 + doAssert d == 5301 + doNotOptimizeAway(d) - benchRelative(regex_uri_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in regex.findAll(bench_text, uri_find_all): - d += 1 - doAssert d == 5301 - doNotOptimizeAway(d) +var ip_find_all_re = re.re"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" - var ip_find_all_re = re.re"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" +bench(re_ip_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in re.findAll(bench_text, ip_find_all_re): + d += 1 + doAssert d == 5 + doNotOptimizeAway(d) - bench(re_ip_find_all, m): - var d = 0 - for i in 0 ..< m: - for _ in re.findAll(bench_text, ip_find_all_re): - d += 1 - doAssert d == 5 - doNotOptimizeAway(d) +const ip_find_all = regex.re2"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" - const ip_find_all = regex.re2"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9])" +benchRelative(regex_ip_find_all, m): + var d = 0 + for i in 0 ..< m: + for _ in regex.findAll(bench_text, ip_find_all): + d += 1 + doAssert d == 5 + doNotOptimizeAway(d) - benchRelative(regex_ip_find_all, m): +when true: + bench(runes, m): var d = 0 for i in 0 ..< m: - for _ in regex.findAll(bench_text, ip_find_all): + for _ in bench_text.runes: d += 1 - doAssert d == 5 doNotOptimizeAway(d) - when true: - bench(runes, m): - var d = 0 - for i in 0 ..< m: - for _ in bench_text.runes: - d += 1 - doNotOptimizeAway(d) - - bench(dummy, m): - for i in 0 ..< m: - memoryClobber() +bench(dummy, m): + for i in 0 ..< m: + memoryClobber() - when isMainModule: - runBenchmarks() +when isMainModule: + runBenchmarks() - #[ - # Profiling: - # (but extract the bench to another module without nimbench) - # open the log with KCachegrind +#[ +# Profiling: +# (but extract the bench to another module without nimbench) +# open the log with KCachegrind - $ nim c --debugger:native --threads:off -d:danger -d:useMalloc -o:bin/bench2 bench/bench2.nim && valgrind --tool=callgrind -v ./bin/bench2 - ]# +$ nim c --debugger:native --threads:off -d:danger -d:useMalloc -o:bin/bench2 bench/bench2.nim && valgrind --tool=callgrind -v ./bin/bench2 +]# diff --git a/bench/bench2.nim b/bench/bench2.nim deleted file mode 100644 index 9f09372..0000000 --- a/bench/bench2.nim +++ /dev/null @@ -1,23 +0,0 @@ -import unicode -from regex import nil - -func genText(): string {.compileTime.} = - result = "" - for _ in 0 .. 100000: - result.add("a") - result.add("sol") - for _ in 0 .. 100000: - result.add("b") - #result.add("ฅ") -const text = genText() - -const pattern4 = regex.re2(r"\w*sol\w*") #, {regex.RegexFlag.reAscii}) - -proc runBenchmarks() = - var m2: regex.RegexMatch2 - for i in 0 ..< 500: - discard regex.match(text, pattern4, m2) - echo m2.captures - -when isMainModule: - runBenchmarks() \ No newline at end of file From bdfef55ce87488f4728aec5c89f7c344cedc6f8e Mon Sep 17 00:00:00 2001 From: nitely Date: Fri, 15 Sep 2023 18:19:02 -0300 Subject: [PATCH 9/9] remove todo --- src/regex/types.nim | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/regex/types.nim b/src/regex/types.nim index c275fff..3c62326 100644 --- a/src/regex/types.nim +++ b/src/regex/types.nim @@ -13,12 +13,6 @@ import ./common # XXX split nfatype.nim and nodetype.nim # once acyclic imports are supported -# XXX refactor transitions, add tIdx: int16 -# to Node, make TransitionsAll dense; -# remove z and store transition Nodes in -# the NFA; flatten TransitionsAll to seq[int16] -# + delimiter (-1'i16) or set first bit of -# every last tn idx type # exptype.nim