From 98e1d8226b6394df25fdadf4693f271c5fb4600f Mon Sep 17 00:00:00 2001 From: nitely Date: Tue, 11 Feb 2025 15:36:16 -0300 Subject: [PATCH 1/6] pstates --- src/regex/nfafindall.nim | 6 +++--- src/regex/nfafindall2.nim | 6 +++--- src/regex/nfamacro.nim | 4 ++-- src/regex/nfamatch.nim | 18 +++++++++--------- src/regex/nfamatch2.nim | 20 ++++++++++---------- src/regex/nfatype.nim | 38 +++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/regex/nfafindall.nim b/src/regex/nfafindall.nim index 40bc480..8d41975 100644 --- a/src/regex/nfafindall.nim +++ b/src/regex/nfafindall.nim @@ -19,7 +19,7 @@ type s: seq[MatchItem] i: int RegexMatches* = object - a, b: Submatches + a, b: Pstates m: Matches c: Capts look: Lookaround @@ -52,8 +52,8 @@ template initMaybeImpl( ) = if ms.a == nil: assert ms.b == nil - ms.a = newSubmatches size - ms.b = newSubmatches size + ms.a = newPstates size + ms.b = newPstates size ms.look = initLook() doAssert ms.a.cap >= size and ms.b.cap >= size diff --git a/src/regex/nfafindall2.nim b/src/regex/nfafindall2.nim index 0507cc7..a199b5b 100644 --- a/src/regex/nfafindall2.nim +++ b/src/regex/nfafindall2.nim @@ -54,7 +54,7 @@ type bounds: Bounds Matches = seq[MatchItem] RegexMatches2* = object - a, b: Submatches + a, b: Pstates m: Matches c: Capts3 look: Lookaround @@ -65,8 +65,8 @@ template initMaybeImpl( ) = if ms.a == nil: assert ms.b == nil - ms.a = newSubmatches size - ms.b = newSubmatches size + ms.a = newPstates size + ms.b = newPstates size ms.c = initCapts3 groupsLen ms.look = initLook() doAssert ms.a.cap >= size and diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim index 5af2fb8..0293eb1 100644 --- a/src/regex/nfamacro.nim +++ b/src/regex/nfamacro.nim @@ -590,8 +590,8 @@ proc matchImpl*(text, expLit, body: NimNode): NimNode = result = quote do: block: var - `smA` = newSubmatches `nfaLenLit` - `smB` = newSubmatches `nfaLenLit` + `smA` = newPstates `nfaLenLit` + `smB` = newPstates `nfaLenLit` `capts` = default(Capts) `capt` = -1'i32 `matched` = false diff --git a/src/regex/nfamatch.nim b/src/regex/nfamatch.nim index 7bc34f0..a324b69 100644 --- a/src/regex/nfamatch.nim +++ b/src/regex/nfamatch.nim @@ -10,7 +10,7 @@ import ./nfatype type AheadSig = proc ( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts, captIdx: var CaptIdx, text: string, @@ -20,7 +20,7 @@ type flags: set[MatchFlag] ): bool {.nimcall, noSideEffect, raises: [].} BehindSig = proc ( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts, captIdx: var CaptIdx, text: string, @@ -111,7 +111,7 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = swap smA, smB func matchImpl( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts, captIdx: var CaptIdx, text: string, @@ -151,7 +151,7 @@ func matchImpl( return smA.len > 0 func reversedMatchImpl( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts, captIdx: var CaptIdx, text: string, @@ -198,7 +198,7 @@ func reversedMatchImpl( return -1 func reversedMatchImpl*( - smA, smB: var Submatches, + smA, smB: var Pstates, text: string, nfa: Nfa, look: var Lookaround, @@ -223,8 +223,8 @@ func matchImpl*( ): bool = m.clear() var - smA = newSubmatches(regex.nfa.s.len) - smB = newSubmatches(regex.nfa.s.len) + smA = newPstates(regex.nfa.s.len) + smB = newPstates(regex.nfa.s.len) capts = default(Capts) capt = -1.CaptIdx look = initLook() @@ -241,8 +241,8 @@ func startsWithImpl*(text: string, regex: Regex, start: int): bool = # XXX optimize mfShortestMatch, mfNoCaptures template flags: untyped = {mfAnchored, mfShortestMatch, mfNoCaptures} var - smA = newSubmatches(regex.nfa.s.len) - smB = newSubmatches(regex.nfa.s.len) + smA = newPstates(regex.nfa.s.len) + smB = newPstates(regex.nfa.s.len) capts = default(Capts) capt = -1.CaptIdx look = initLook() diff --git a/src/regex/nfamatch2.nim b/src/regex/nfamatch2.nim index d66d7c6..63a2b73 100644 --- a/src/regex/nfamatch2.nim +++ b/src/regex/nfamatch2.nim @@ -10,7 +10,7 @@ import ./nfatype type AheadSig = proc ( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts3, captIdx: var CaptIdx, text: string, @@ -20,7 +20,7 @@ type flags: MatchFlags ): bool {.nimcall, noSideEffect, raises: [].} BehindSig = proc ( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts3, captIdx: var CaptIdx, text: string, @@ -120,7 +120,7 @@ func epsilonMatch*( discard func nextState( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts3, look: var Lookaround, text: string, @@ -167,7 +167,7 @@ func nextState( capts.recycle() func matchImpl( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts3, captIdx: var CaptIdx, text: string, @@ -211,7 +211,7 @@ func matchImpl( return smA.len > 0 func reversedMatchImpl( - smA, smB: var Submatches, + smA, smB: var Pstates, capts: var Capts3, captIdx: var CaptIdx, text: string, @@ -265,7 +265,7 @@ func reversedMatchImpl( return -1 func reversedMatchImpl*( - smA, smB: var Submatches, + smA, smB: var Pstates, text: string, nfa: Nfa, look: var Lookaround, @@ -295,8 +295,8 @@ func matchImpl*( m.clear() let flags = regex.flags.toMatchFlags + flags var - smA = newSubmatches(regex.nfa.s.len) - smB = newSubmatches(regex.nfa.s.len) + smA = newPstates(regex.nfa.s.len) + smB = newPstates(regex.nfa.s.len) capts = initCapts3(regex.groupsCount) captIdx = -1.CaptIdx look = initLook() @@ -323,8 +323,8 @@ func startsWithImpl2*( # XXX optimize mfShortestMatch, mfNoCaptures let flags = regex.flags.toMatchFlags + {mfAnchored, mfShortestMatch, mfNoCaptures} var - smA = newSubmatches(regex.nfa.s.len) - smB = newSubmatches(regex.nfa.s.len) + smA = newPstates(regex.nfa.s.len) + smB = newPstates(regex.nfa.s.len) capts = initCapts3(regex.groupsCount) captIdx = -1.CaptIdx look = initLook() diff --git a/src/regex/nfatype.nim b/src/regex/nfatype.nim index ff43a8f..dd2863d 100644 --- a/src/regex/nfatype.nim +++ b/src/regex/nfatype.nim @@ -23,7 +23,7 @@ const type # XXX int16 same as max parallel states or max regex len - # but it's used by PState and the old capts + # but it's used by Pstate and the old capts CaptIdx* = int32 Capts3* = object ## Seq of captures divided into blocks @@ -300,34 +300,34 @@ func clear*(m: var RegexMatch2) {.inline.} = type NodeIdx* = int16 Bounds* = Slice[int] - PState* = tuple + Pstate* = tuple ni: NodeIdx ci: CaptIdx bounds: Bounds - Submatches* = ref object + Pstates* = ref object ## Parallel states would be a better name. ## This is a sparse set - sx: seq[PState] + sx: seq[Pstate] ss: seq[int16] si: int16 -func newSubmatches*(size: int): Submatches {.inline.} = - result = new Submatches - result.sx = newSeq[PState](8) +func newPstates*(size: int): Pstates {.inline.} = + result = new Pstates + result.sx = newSeq[Pstate](8) result.ss = newSeq[int16](size) result.si = 0 when defined(release): {.push checks: off.} -func `[]`*(sm: Submatches, i: int): PState {.inline.} = +func `[]`*(sm: Pstates, i: int): Pstate {.inline.} = assert i < sm.si sm.sx[i] -func hasState*(sm: Submatches, n: int16): bool {.inline.} = +func hasState*(sm: Pstates, n: int16): bool {.inline.} = sm.ss[n] < sm.si and sm.sx[sm.ss[n]].ni == n -func add*(sm: var Submatches, item: PState) {.inline.} = +func add*(sm: var Pstates, item: Pstate) {.inline.} = assert(not sm.hasState(item.ni)) assert sm.si <= sm.sx.len if (sm.si == sm.sx.len).unlikely: @@ -336,25 +336,25 @@ func add*(sm: var Submatches, item: PState) {.inline.} = sm.ss[item.ni] = sm.si sm.si += 1'i16 -func len*(sm: Submatches): int {.inline.} = +func len*(sm: Pstates): int {.inline.} = sm.si -func clear*(sm: var Submatches) {.inline.} = +func clear*(sm: var Pstates) {.inline.} = sm.si = 0 -iterator items*(sm: Submatches): PState {.inline.} = +iterator items*(sm: Pstates): Pstate {.inline.} = for i in 0 .. sm.len-1: yield sm.sx[i] # does not work in Nim <= 0.20 -#iterator mitems*(sm: Submatches): var PState {.inline.} = +#iterator mitems*(sm: Pstates): var Pstate {.inline.} = # for i in 0 .. sm.len-1: # yield sm.sx[i] -func cap*(sm: Submatches): int {.inline.} = +func cap*(sm: Pstates): int {.inline.} = sm.ss.len -func setLen*(sm: var Submatches, size: int) {.inline.} = +func setLen*(sm: var Pstates, size: int) {.inline.} = sm.ss.setLen size when defined(release): @@ -364,7 +364,7 @@ when defined(release): # size seq to reduce allocations type SmLookaroundItem* = object - a, b: Submatches + a, b: Pstates SmLookaround* = object s: seq[SmLookaroundItem] i: int @@ -372,8 +372,8 @@ type func setLen*(item: var SmLookaroundItem, size: int) {.inline.} = if item.a == nil: doAssert item.b == nil - item.a = newSubmatches size - item.b = newSubmatches size + item.a = newPstates size + item.b = newPstates size else: doAssert item.b != nil item.a.setLen size From e4ee660addafde018e5b8476e91e59c5ec397517 Mon Sep 17 00:00:00 2001 From: nitely Date: Tue, 11 Feb 2025 17:21:56 -0300 Subject: [PATCH 2/6] Parallel states refactor --- bench/bench.nim | 4 ++ src/regex/nfafindall.nim | 28 ++++++-------- src/regex/nfafindall2.nim | 30 +++++++-------- src/regex/nfamacro.nim | 25 +++++++------ src/regex/nfamatch.nim | 37 ++++++++++--------- src/regex/nfamatch2.nim | 35 ++++++++++-------- src/regex/nfatype.nim | 77 +++++++++++++++++++-------------------- 7 files changed, 119 insertions(+), 117 deletions(-) diff --git a/bench/bench.nim b/bench/bench.nim index 8585659..937676c 100644 --- a/bench/bench.nim +++ b/bench/bench.nim @@ -244,4 +244,8 @@ when isMainModule: # open the log with KCachegrind $ nim c --debugger:native --threads:off -d:danger -d:useMalloc -o:bin/bench2 bench/bench2.nim && valgrind --tool=callgrind -v ./bin/bench2 + +# Bench + +$ nim c -r --threads:off -d:danger --mm:arc -o:bin/bench bench/bench.nim ]# diff --git a/src/regex/nfafindall.nim b/src/regex/nfafindall.nim index 8d41975..4aa4f0c 100644 --- a/src/regex/nfafindall.nim +++ b/src/regex/nfafindall.nim @@ -46,22 +46,18 @@ func add(ms: var Matches, m: MatchItem) {.inline.} = func clear(ms: var Matches) {.inline.} = ms.i = 0 -template initMaybeImpl( +func initMaybeImpl( ms: var RegexMatches, size: int -) = - if ms.a == nil: - assert ms.b == nil - ms.a = newPstates size - ms.b = newPstates size - ms.look = initLook() - doAssert ms.a.cap >= size and - ms.b.cap >= size +) {.inline.} = + ms.a.reset size + ms.b.reset size + ms.look = initLook() -template initMaybeImpl( +func initMaybeImpl( ms: var RegexMatches, regex: Regex -) = +) {.inline.} = initMaybeImpl(ms, regex.nfa.s.len) func hasMatches(ms: RegexMatches): bool {.inline.} = @@ -130,7 +126,7 @@ func submatch( while nti < L: let isEoe = ntn.kind == reEoe let nt0 = nt - matched = not smB.hasState(nt) and + matched = nt notin smB and (ntn.match(c.Rune) or ntn.kind == reEoe) inc nti captx = capt @@ -158,10 +154,10 @@ func submatch( smA.clear() if not eoeFound: eoeFound = true - smA.add (0'i16, -1.CaptIdx, i .. i-1) + smA.add initPstate(0'i16, -1.CaptIdx, i .. i-1) smi = -1 break - smB.add (nt0, captx, bounds.a .. i-1) + smB.add initPstate(nt0, captx, bounds.a .. i-1) inc smi swap smA, smB @@ -181,7 +177,7 @@ func findSomeImpl*( i = start.int iPrev = start.int optFlag = mfFindMatchOpt in flags - smA.add (0'i16, -1.CaptIdx, i .. i-1) + smA.add initPstate(0'i16, -1.CaptIdx, i .. i-1) if start-1 in 0 .. text.len-1: cPrev = bwRuneAt(text, start-1).int32 while i < text.len: @@ -200,7 +196,7 @@ func findSomeImpl*( # else: # XXX clear captures if optFlag: return i - smA.add (0'i16, -1.CaptIdx, i .. i-1) + smA.add initPstate(0'i16, -1.CaptIdx, i .. i-1) iPrev = i cPrev = c.int32 submatch(ms, text, regex, iPrev, cPrev, -1'i32) diff --git a/src/regex/nfafindall2.nim b/src/regex/nfafindall2.nim index a199b5b..e8d950b 100644 --- a/src/regex/nfafindall2.nim +++ b/src/regex/nfafindall2.nim @@ -59,23 +59,19 @@ type c: Capts3 look: Lookaround -template initMaybeImpl( +func initMaybeImpl( ms: var RegexMatches2, size, groupsLen: int -) = - if ms.a == nil: - assert ms.b == nil - ms.a = newPstates size - ms.b = newPstates size - ms.c = initCapts3 groupsLen - ms.look = initLook() - doAssert ms.a.cap >= size and - ms.b.cap >= size +) {.inline.} = + ms.a.reset(size) + ms.b.reset(size) + ms.c.reset(groupsLen) + ms.look = initLook() -template initMaybeImpl( +func initMaybeImpl( ms: var RegexMatches2, regex: Regex -) = +) {.inline.} = initMaybeImpl(ms, regex.nfa.s.len, regex.groupsCount) func add(ms: var RegexMatches2, m: MatchItem) {.inline.} = @@ -170,7 +166,7 @@ func nextState( while nti < L: let isEoe = ntn.kind == reEoe let nt0 = nt - matched = not smB.hasState(nt) and + matched = nt notin smB and (ntn.match(c.Rune) or ntn.kind == reEoe) inc nti captx = capt @@ -187,10 +183,10 @@ func nextState( smA.clear() if not eoeFound: eoeFound = true - smA.add (0'i16, -1.CaptIdx, i .. i-1) + smA.add initPstate(0'i16, -1.CaptIdx, i .. i-1) smi = -1 break - smB.add (nt0, captx, bounds.a .. i-1) + smB.add initPstate(nt0, captx, bounds.a .. i-1) inc smi swap smA, smB capts.recycle() @@ -214,7 +210,7 @@ func findSomeImpl*( flags = regex.flags.toMatchFlags + flags optFlag = mfFindMatchOpt in flags binFlag = mfBytesInput in flags - smA.add (0'i16, -1.CaptIdx, i .. i-1) + smA.add initPstate(0'i16, -1.CaptIdx, i .. i-1) if start-1 in 0 .. text.len-1: cPrev = if binFlag: text[start-1].int32 @@ -236,7 +232,7 @@ func findSomeImpl*( return i if optFlag: return i - smA.add (0'i16, -1.CaptIdx, i .. i-1) + smA.add initPstate(0'i16, -1.CaptIdx, i .. i-1) iPrev = i cPrev = c.int32 nextState(ms, text, regex, iPrev, cPrev, -1'i32, flags) diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim index 0293eb1..d5bccc0 100644 --- a/src/regex/nfamacro.nim +++ b/src/regex/nfamacro.nim @@ -293,7 +293,7 @@ func genMatchedBody( let eTransitions = getEpsilonTransitions(nfa, n, nti) if eTransitions.len == 0: return quote do: - add(`smB`, (`ntLit`, `capt`, `bounds2`)) + add(`smB`, initPstate(`ntLit`, `capt`, `bounds2`)) var matchedBody = newSeq[NimNode]() matchedBody.add quote do: `matched` = true @@ -325,7 +325,7 @@ func genMatchedBody( doAssert false matchedBody.add quote do: if `matched`: - add(`smB`, (`ntLit`, `captx`, `bounds2`)) + add(`smB`, initPstate(`ntLit`, `captx`, `bounds2`)) return newStmtList matchedBody func genNextState( @@ -339,10 +339,10 @@ func genNextState( #[ case n of 0: - if not smB.hasState(1): + if not smB.contains(1): if c == 'a': smB.add((1, capt, bounds)) - if not smB.hasState(4): + if not smB.contains(4): if c == 'b': smB.add((4, capt, bounds)) of 1: @@ -384,11 +384,11 @@ func genNextState( i, nti, nfa, look, flags) if mfAnchored in flags and s[nt].kind == reEoe: branchBodyN.add quote do: - if not hasState(`smB`, `ntLit`): + if not contains(`smB`, `ntLit`): `matchedBodyStmt` else: branchBodyN.add quote do: - if not hasState(`smB`, `ntLit`) and `matchCond`: + if not contains(`smB`, `ntLit`) and `matchCond`: `matchedBodyStmt` doAssert eoeOnly or branchBodyN.len > 0 if branchBodyN.len > 0: @@ -418,7 +418,10 @@ func nextState( flags: set[MatchFlag], eoeOnly = false ): NimNode = - defForVars n, capt, bounds + defForVars pstate + let n = quote do: `pstate`.ni + let capt = quote do: `pstate`.ci + let bounds = quote do: `pstate`.bounds let eoeBailOut = if mfAnchored in flags: quote do: if `n` == `eoe`: @@ -433,7 +436,7 @@ func nextState( flags, eoeOnly) result = quote do: `smB`.clear() - for `n`, `capt`, `bounds` in `smA`.items: + for `pstate` in `smA`.items: `eoeBailOut` `nextStateStmt` swap `smA`, `smB` @@ -483,7 +486,7 @@ func matchImpl( if `start`-1 in 0 .. `text`.len-1: `cPrev` = bwRuneAt(`text`, `start`-1).int32 clear(`smA`) - add(`smA`, (0'i16, `captIdx`, `i` .. `i`-1)) + add(`smA`, initPstate(0'i16, `captIdx`, `i` .. `i`-1)) while `i` < `text`.len: fastRuneAt(`text`, iNext, `c`, true) `nextStateStmt` @@ -590,8 +593,8 @@ proc matchImpl*(text, expLit, body: NimNode): NimNode = result = quote do: block: var - `smA` = newPstates `nfaLenLit` - `smB` = newPstates `nfaLenLit` + `smA` = initPstates `nfaLenLit` + `smB` = initPstates `nfaLenLit` `capts` = default(Capts) `capt` = -1'i32 `matched` = false diff --git a/src/regex/nfamatch.nim b/src/regex/nfamatch.nim index a324b69..35f2a14 100644 --- a/src/regex/nfamatch.nim +++ b/src/regex/nfamatch.nim @@ -44,7 +44,7 @@ template lookAroundTpl*: untyped {.dirty.} = else: {mfAnchored} smL.grow() - smL.last.setLen zNfa.s.len + smL.last.reset zNfa.s.len matched = case ntn.kind of reLookahead: look.ahead( @@ -72,17 +72,20 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = when bwMatch: i .. bounds.b else: bounds.a .. i-1 template nt: untyped = nfa.s[n].next[nti] template ntn: untyped = nfa.s[nt] + template n: untyped = pstate.ni + template capt: untyped = pstate.ci + template bounds: untyped = pstate.bounds smB.clear() - for n, capt, bounds in items smA: + for pstate in items smA: if anchored and nfa.s[n].kind == reEoe: - if not smB.hasState n: - smB.add (n, capt, bounds) + if n notin smB: + smB.add initPstate(n, capt, bounds) break let L = nfa.s[n].next.len var nti = 0 while nti < L: let nt0 = nt - matched = not smB.hasState(nt) and + matched = nt notin smB and (ntn.match(c) or (anchored and ntn.kind == reEoe)) inc nti captx = capt @@ -107,7 +110,7 @@ template nextStateTpl(bwMatch = false): untyped {.dirty.} = discard inc nti if matched: - smB.add (nt0, captx, bounds2) + smB.add initPstate(nt0, captx, bounds2) swap smA, smB func matchImpl( @@ -131,7 +134,7 @@ func matchImpl( if start-1 in 0 .. text.len-1: cPrev = bwRuneAt(text, start-1).int32 smA.clear() - smA.add (0'i16, captIdx, i .. i-1) + smA.add initPstate(0'i16, captIdx, i .. i-1) while i < text.len: fastRuneAt(text, iNext, c, true) nextStateTpl() @@ -174,7 +177,7 @@ func reversedMatchImpl( if start in 0 .. text.len-1: cPrev = text.runeAt(start).int32 smA.clear() - smA.add (0'i16, captIdx, i .. i-1) + smA.add initPstate(0'i16, captIdx, i .. i-1) while iNext > limit: bwFastRuneAt(text, iNext, c) nextStateTpl(bwMatch = true) @@ -188,13 +191,13 @@ func reversedMatchImpl( if iNext > 0: bwFastRuneAt(text, iNext, c) nextStateTpl(bwMatch = true) - for n, capt, bounds in items smA: - if nfa.s[n].kind == reEoe: + for pstate in items smA: + if nfa.s[pstate.ni].kind == reEoe: if mfReverseCapts in flags: - captIdx = reverse(capts, capt, captIdx) + captIdx = reverse(capts, pstate.ci, captIdx) else: - captIdx = capt - return bounds.a + captIdx = pstate.ci + return pstate.bounds.a return -1 func reversedMatchImpl*( @@ -223,8 +226,8 @@ func matchImpl*( ): bool = m.clear() var - smA = newPstates(regex.nfa.s.len) - smB = newPstates(regex.nfa.s.len) + smA = initPstates(regex.nfa.s.len) + smB = initPstates(regex.nfa.s.len) capts = default(Capts) capt = -1.CaptIdx look = initLook() @@ -241,8 +244,8 @@ func startsWithImpl*(text: string, regex: Regex, start: int): bool = # XXX optimize mfShortestMatch, mfNoCaptures template flags: untyped = {mfAnchored, mfShortestMatch, mfNoCaptures} var - smA = newPstates(regex.nfa.s.len) - smB = newPstates(regex.nfa.s.len) + smA = initPstates(regex.nfa.s.len) + smB = initPstates(regex.nfa.s.len) capts = default(Capts) capt = -1.CaptIdx look = initLook() diff --git a/src/regex/nfamatch2.nim b/src/regex/nfamatch2.nim index 63a2b73..5d24d8a 100644 --- a/src/regex/nfamatch2.nim +++ b/src/regex/nfamatch2.nim @@ -53,7 +53,7 @@ func lookAround( if mfBytesInput in flags: flags2.incl mfBytesInput smL.grow() - smL.last.setLen subNfa.s.len + smL.last.reset subNfa.s.len result = case ntn.kind of reLookahead: look.ahead( @@ -136,22 +136,25 @@ func nextState( if bwMatch: i .. bounds.b else: bounds.a .. i-1 template nt: untyped = nfa[n].next[nti] template ntn: untyped = nfa[nt] + template n: untyped = pstate.ni + template capt: untyped = pstate.ci + template bounds: untyped = pstate.bounds let anchored = mfAnchored in flags var captx = 0.CaptIdx var matched = true smB.clear() - for n, capt, bounds in items smA: + for pstate in items smA: if capt != -1: capts.keepAlive capt if anchored and nfa[n].kind == reEoe: - if not smB.hasState n: - smB.add (n, capt, bounds) + if n notin smB: + smB.add initPstate(n, capt, bounds) break let L = nfa[n].next.len var nti = 0 while nti < L: let nt0 = nt - matched = not smB.hasState(nt) and + matched = nt notin smB and (ntn.match(c) or (anchored and ntn.kind == reEoe)) inc nti captx = capt @@ -162,7 +165,7 @@ func nextState( ) inc nti if matched: - smB.add (nt0, captx, bounds2) + smB.add initPstate(nt0, captx, bounds2) swap smA, smB capts.recycle() @@ -190,7 +193,7 @@ func matchImpl( else: bwRuneAt(text, start-1).int32 smA.clear() - smA.add (0'i16, captIdx, i .. i-1) + smA.add initPstate(0'i16, captIdx, i .. i-1) while i < text.len: if binFlag: c = text[iNext].Rune @@ -236,7 +239,7 @@ func reversedMatchImpl( else: runeAt(text, start).int32 smA.clear() - smA.add (0'i16, captIdx, i .. i-1) + smA.add initPstate(0'i16, captIdx, i .. i-1) while iNext > limit: if binFlag: c = text[iNext-1].Rune @@ -258,10 +261,10 @@ func reversedMatchImpl( else: bwFastRuneAt(text, iNext, c) nextState(smA, smB, capts, look, text, nfa, i, cPrev, c, flags, bwMatch = true) - for n, capt, bounds in items smA: - if nfa.s[n].kind == reEoe: - captIdx = capt - return bounds.a + for pstate in items smA: + if nfa.s[pstate.ni].kind == reEoe: + captIdx = pstate.ci + return pstate.bounds.a return -1 func reversedMatchImpl*( @@ -295,8 +298,8 @@ func matchImpl*( m.clear() let flags = regex.flags.toMatchFlags + flags var - smA = newPstates(regex.nfa.s.len) - smB = newPstates(regex.nfa.s.len) + smA = initPstates(regex.nfa.s.len) + smB = initPstates(regex.nfa.s.len) capts = initCapts3(regex.groupsCount) captIdx = -1.CaptIdx look = initLook() @@ -323,8 +326,8 @@ func startsWithImpl2*( # XXX optimize mfShortestMatch, mfNoCaptures let flags = regex.flags.toMatchFlags + {mfAnchored, mfShortestMatch, mfNoCaptures} var - smA = newPstates(regex.nfa.s.len) - smB = newPstates(regex.nfa.s.len) + smA = initPstates(regex.nfa.s.len) + smB = initPstates(regex.nfa.s.len) capts = initCapts3(regex.groupsCount) captIdx = -1.CaptIdx look = initLook() diff --git a/src/regex/nfatype.nim b/src/regex/nfatype.nim index dd2863d..19094d2 100644 --- a/src/regex/nfatype.nim +++ b/src/regex/nfatype.nim @@ -67,14 +67,21 @@ template fastLog2Tpl(x: Natural): untyped = else: fastLog2(x) +func reset*(capts: var Capts3, groupsLen: int) = + if groupsLen == 0: + return + if capts.groupsLen != groupsLen: + let blockSize = max(2, nextPowerOfTwo groupsLen) + capts.groupsLen = groupsLen + capts.blockSize = blockSize + capts.blockSizeL2 = fastLog2Tpl blockSize + capts.freezeId = stsFrozen.a + capts.s.setLen 0 + capts.states.setLen 0 + capts.free.setLen 0 + func initCapts3*(groupsLen: int): Capts3 = - let blockSize = max(2, nextPowerOfTwo groupsLen) - Capts3( - groupsLen: groupsLen, - blockSize: blockSize, - blockSizeL2: fastLog2Tpl blockSize, - freezeId: stsFrozen.a - ) + reset(result, groupsLen) func check(curr, next: CaptState): bool = ## Check if transition from state curr to next is allowed @@ -300,35 +307,39 @@ func clear*(m: var RegexMatch2) {.inline.} = type NodeIdx* = int16 Bounds* = Slice[int] - Pstate* = tuple - ni: NodeIdx - ci: CaptIdx - bounds: Bounds - Pstates* = ref object - ## Parallel states would be a better name. + Pstate* = object + ni*: NodeIdx + ci*: CaptIdx + bounds*: Bounds + Pstates* = object ## This is a sparse set sx: seq[Pstate] ss: seq[int16] si: int16 -func newPstates*(size: int): Pstates {.inline.} = - result = new Pstates - result.sx = newSeq[Pstate](8) - result.ss = newSeq[int16](size) - result.si = 0 +func initPstate*(ni: NodeIdx, ci: CaptIdx, bounds: Bounds): Pstate {.inline.} = + Pstate(ni: ni, ci: ci, bounds: bounds) when defined(release): {.push checks: off.} -func `[]`*(sm: Pstates, i: int): Pstate {.inline.} = +func reset*(sm: var Pstates, size: int) {.inline.} = + sm.sx.setLen 8 + sm.ss.setLen size + sm.si = 0 + +func initPstates*(size: int): Pstates {.inline.} = + reset(result, size) + +func `[]`*(sm: Pstates, i: int): lent Pstate {.inline.} = assert i < sm.si sm.sx[i] -func hasState*(sm: Pstates, n: int16): bool {.inline.} = +func contains*(sm: Pstates, n: int16): bool {.inline.} = sm.ss[n] < sm.si and sm.sx[sm.ss[n]].ni == n -func add*(sm: var Pstates, item: Pstate) {.inline.} = - assert(not sm.hasState(item.ni)) +func add*(sm: var Pstates, item: sink Pstate) {.inline.} = + assert(item.ni notin sm) assert sm.si <= sm.sx.len if (sm.si == sm.sx.len).unlikely: sm.sx.setLen(sm.sx.len * 2) @@ -342,21 +353,13 @@ func len*(sm: Pstates): int {.inline.} = func clear*(sm: var Pstates) {.inline.} = sm.si = 0 -iterator items*(sm: Pstates): Pstate {.inline.} = +iterator items*(sm: Pstates): lent Pstate {.inline.} = for i in 0 .. sm.len-1: yield sm.sx[i] -# does not work in Nim <= 0.20 -#iterator mitems*(sm: Pstates): var Pstate {.inline.} = -# for i in 0 .. sm.len-1: -# yield sm.sx[i] - func cap*(sm: Pstates): int {.inline.} = sm.ss.len -func setLen*(sm: var Pstates, size: int) {.inline.} = - sm.ss.setLen size - when defined(release): {.pop.} @@ -369,15 +372,9 @@ type s: seq[SmLookaroundItem] i: int -func setLen*(item: var SmLookaroundItem, size: int) {.inline.} = - if item.a == nil: - doAssert item.b == nil - item.a = newPstates size - item.b = newPstates size - else: - doAssert item.b != nil - item.a.setLen size - item.b.setLen size +func reset*(item: var SmLookaroundItem, size: int) {.inline.} = + item.a.reset size + item.b.reset size template last*(sm: SmLookaround): untyped = sm.s[sm.i-1] From 42bb420f6f590da3fe25d610978d940365341785 Mon Sep 17 00:00:00 2001 From: nitely Date: Tue, 11 Feb 2025 17:43:21 -0300 Subject: [PATCH 3/6] wip --- src/regex/nfamacro.nim | 8 ++++---- src/regex/nfatype.nim | 10 ++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim index d5bccc0..e984150 100644 --- a/src/regex/nfamacro.nim +++ b/src/regex/nfamacro.nim @@ -263,7 +263,7 @@ func genLookaroundMatch( let nfaLenLit = newLit nfa.s.len result = quote do: grow `smL` - `smL`.last.setLen `nfaLenLit` + `smL`.last.reset `nfaLenLit` `lookaroundStmt` removeLast `smL` @@ -425,8 +425,8 @@ func nextState( let eoeBailOut = if mfAnchored in flags: quote do: if `n` == `eoe`: - if not hasState(`smB`, `n`): - add(`smB`, (`n`, `capt`, `bounds`)) + if not contains(`smB`, `n`): + add(`smB`, initPstate(`n`, `capt`, `bounds`)) break else: newEmptyNode() @@ -537,7 +537,7 @@ func reversedMatchImpl( if `start` in 0 .. `text`.len-1: `cPrev` = runeAt(`text`, `start`).int32 clear(`smA`) - add(`smA`, (0'i16, `captIdx`, `i` .. `i`-1)) + add(`smA`, initPstate(0'i16, `captIdx`, `i` .. `i`-1)) while iNext > 0: bwFastRuneAt(`text`, iNext, `c`) `nextStateStmt` diff --git a/src/regex/nfatype.nim b/src/regex/nfatype.nim index 19094d2..0d2a8a5 100644 --- a/src/regex/nfatype.nim +++ b/src/regex/nfatype.nim @@ -68,17 +68,15 @@ template fastLog2Tpl(x: Natural): untyped = fastLog2(x) func reset*(capts: var Capts3, groupsLen: int) = - if groupsLen == 0: - return + capts.freezeId = stsFrozen.a + capts.s.setLen 0 + capts.states.setLen 0 + capts.free.setLen 0 if capts.groupsLen != groupsLen: let blockSize = max(2, nextPowerOfTwo groupsLen) capts.groupsLen = groupsLen capts.blockSize = blockSize capts.blockSizeL2 = fastLog2Tpl blockSize - capts.freezeId = stsFrozen.a - capts.s.setLen 0 - capts.states.setLen 0 - capts.free.setLen 0 func initCapts3*(groupsLen: int): Capts3 = reset(result, groupsLen) From 580f5849001b2b3d5bcdfb1da35e3fe89a5c7f6f Mon Sep 17 00:00:00 2001 From: nitely Date: Tue, 11 Feb 2025 21:06:02 -0300 Subject: [PATCH 4/6] wip --- src/regex/nfamacro.nim | 21 ++++++++------------- src/regex/nfamatch.nim | 10 +++------- src/regex/nfamatch2.nim | 12 +++++------- src/regex/nfatype.nim | 32 -------------------------------- 4 files changed, 16 insertions(+), 59 deletions(-) diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim index e984150..5a6839d 100644 --- a/src/regex/nfamacro.nim +++ b/src/regex/nfamacro.nim @@ -38,7 +38,6 @@ type ): NimNode {.nimcall, noSideEffect, raises: [].} Lookaround = object ahead, behind: Sig - smL: NimNode # todo: can not use unicodeplus due to # https://github.com/nim-lang/Nim/issues/7059 @@ -240,9 +239,7 @@ func genLookaroundMatch( look: Lookaround ): NimNode = template nfa: untyped = n.subExp.nfa - template smL: untyped = look.smL - let smlA = quote do: lastA(`smL`) - let smlB = quote do: lastB(`smL`) + defVars smlA, smlB var flags = {mfAnchored} if n.subExp.reverseCapts: flags.incl mfReverseCapts @@ -262,10 +259,9 @@ func genLookaroundMatch( `matched` = not `matched` let nfaLenLit = newLit nfa.s.len result = quote do: - grow `smL` - `smL`.last.reset `nfaLenLit` + var `smlA` = initPstates(`nfaLenLit`) + var `smlB` = initPstates(`nfaLenLit`) `lookaroundStmt` - removeLast `smL` func getEpsilonTransitions(nfa: Nfa, n: Node, nti: int): seq[int] = doAssert not isEpsilonTransition(n) @@ -554,11 +550,11 @@ func reversedMatchImpl( `captsStmt` `matched` = `smA`.len > 0 -template look(smL: NimNode): untyped = +template look: untyped = Lookaround( ahead: matchImpl, - behind: reversedMatchImpl, - smL: smL) + behind: reversedMatchImpl + ) template constructSubmatches2( captures, txt, capts, capt, size: untyped @@ -581,13 +577,13 @@ proc matchImpl*(text, expLit, body: NimNode): NimNode = if not (expLit.kind == nnkCallStrLit and $expLit[0] == "rex"): error "not a regex literal; only rex\"regex\" is allowed", expLit let exp = expLit[1] - defVars smA, smB, capts, capt, matched, smL + defVars smA, smB, capts, capt, matched let regex = reCt(exp.strVal) let startLit = newLit 0 let flags: set[MatchFlag] = {} let matchImplStmt = matchImpl( smA, smB, capts, capt, matched, - text, startLit, regex.nfa, look(smL), flags) + text, startLit, regex.nfa, look(), flags) let nfaLenLit = newLit regex.nfa.s.len let nfaGroupsLen = int(regex.groupsCount) result = quote do: @@ -598,7 +594,6 @@ proc matchImpl*(text, expLit, body: NimNode): NimNode = `capts` = default(Capts) `capt` = -1'i32 `matched` = false - `smL` {.used.} = default(SmLookaround) `matchImplStmt` if `matched`: var matches {.used, inject.} = newSeq[string]() diff --git a/src/regex/nfamatch.nim b/src/regex/nfamatch.nim index 35f2a14..9df4bfa 100644 --- a/src/regex/nfamatch.nim +++ b/src/regex/nfamatch.nim @@ -32,19 +32,16 @@ type Lookaround* = object ahead*: AheadSig behind*: BehindSig - smL*: SmLookaround + #smL*: SmLookaround template lookAroundTpl*: untyped {.dirty.} = - template smL: untyped = look.smL - template smLa: untyped = smL.lastA - template smLb: untyped = smL.lastB template zNfa: untyped = ntn.subExp.nfa let flags2 = if ntn.subExp.reverseCapts: {mfAnchored, mfReverseCapts} else: {mfAnchored} - smL.grow() - smL.last.reset zNfa.s.len + var smLa = initPstates(zNfa.s.len) + var smLb = initPstates(zNfa.s.len) matched = case ntn.kind of reLookahead: look.ahead( @@ -65,7 +62,6 @@ template lookAroundTpl*: untyped {.dirty.} = else: doAssert false false - smL.removeLast() template nextStateTpl(bwMatch = false): untyped {.dirty.} = template bounds2: untyped = diff --git a/src/regex/nfamatch2.nim b/src/regex/nfamatch2.nim index 5d24d8a..9abe2aa 100644 --- a/src/regex/nfamatch2.nim +++ b/src/regex/nfamatch2.nim @@ -32,7 +32,7 @@ type Lookaround* = object ahead*: AheadSig behind*: BehindSig - smL*: SmLookaround + #smL*: SmLookaround func lookAround( ntn: Node, @@ -43,17 +43,16 @@ func lookAround( start: int, flags: MatchFlags ): bool = - template smL: untyped = look.smL - template smLa: untyped = smL.lastA - template smLb: untyped = smL.lastB template subNfa: untyped = ntn.subExp.nfa var flags2 = {mfAnchored} if ntn.subExp.reverseCapts: flags2.incl mfReverseCapts if mfBytesInput in flags: flags2.incl mfBytesInput - smL.grow() - smL.last.reset subNfa.s.len + # XXX store lookaround number + count, and use a fixed + # size seq to reduce allocations; use look.smL + var smLa = initPstates(subNfa.s.len) + var smLb = initPstates(subNfa.s.len) result = case ntn.kind of reLookahead: look.ahead( @@ -74,7 +73,6 @@ func lookAround( else: doAssert false false - smL.removeLast() func epsilonMatch*( matched: var bool, diff --git a/src/regex/nfatype.nim b/src/regex/nfatype.nim index 0d2a8a5..d180d67 100644 --- a/src/regex/nfatype.nim +++ b/src/regex/nfatype.nim @@ -361,38 +361,6 @@ func cap*(sm: Pstates): int {.inline.} = when defined(release): {.pop.} -# XXX maybe store the lookaround number + count, and use a fixed -# size seq to reduce allocations -type - SmLookaroundItem* = object - a, b: Pstates - SmLookaround* = object - s: seq[SmLookaroundItem] - i: int - -func reset*(item: var SmLookaroundItem, size: int) {.inline.} = - item.a.reset size - item.b.reset size - -template last*(sm: SmLookaround): untyped = - sm.s[sm.i-1] - -template lastA*(sm: SmLookaround): untyped = - last(sm).a - -template lastB*(sm: SmLookaround): untyped = - last(sm).b - -func grow*(sm: var SmLookaround) {.inline.} = - doAssert sm.i <= sm.s.len - if sm.i == sm.s.len: - sm.s.setLen(max(1, sm.s.len) * 2) - sm.i += 1 - -func removeLast*(sm: var SmLookaround) {.inline.} = - doAssert sm.i > 0 - sm.i -= 1 - when isMainModule: func `[]=`(capts: var Capts3, i, j: Natural, x: Slice[int]) = doAssert i <= capts.len-1 From b15294dd72639e3eb5c414eba956b148c6fc3442 Mon Sep 17 00:00:00 2001 From: nitely Date: Tue, 11 Feb 2025 21:37:40 -0300 Subject: [PATCH 5/6] afdrwqfdsf --- src/regex/nfatype.nim | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/regex/nfatype.nim b/src/regex/nfatype.nim index d180d67..3fd7069 100644 --- a/src/regex/nfatype.nim +++ b/src/regex/nfatype.nim @@ -309,7 +309,8 @@ type ni*: NodeIdx ci*: CaptIdx bounds*: Bounds - Pstates* = object + # XXX this is a ref because of Nim JS bugs; it works in +2.2.0 + Pstates* = ref object ## This is a sparse set sx: seq[Pstate] ss: seq[int16] @@ -322,6 +323,8 @@ when defined(release): {.push checks: off.} func reset*(sm: var Pstates, size: int) {.inline.} = + if sm == nil: + sm = Pstates() sm.sx.setLen 8 sm.ss.setLen size sm.si = 0 From e52701173c17104ffc9b5b00c8958183203cfa02 Mon Sep 17 00:00:00 2001 From: nitely Date: Tue, 11 Feb 2025 21:46:22 -0300 Subject: [PATCH 6/6] ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fe0444f..4e2c11f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - nim: [1.6.18, 2.0.0, 2.2.0] + nim: [1.6.18, 1.6.20, 2.0.0, 2.0.14, 2.2.0] steps: - uses: actions/checkout@v2 - name: Run Tests