From 40d069e464649c2c8936aa35fcf39d31cbd0c51f Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 20:44:57 +0800 Subject: [PATCH 1/8] docs(plans): sticky pool failover plan --- docs/plans/20260518-sticky-failover.md | 103 +++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 docs/plans/20260518-sticky-failover.md diff --git a/docs/plans/20260518-sticky-failover.md b/docs/plans/20260518-sticky-failover.md new file mode 100644 index 0000000..50e119f --- /dev/null +++ b/docs/plans/20260518-sticky-failover.md @@ -0,0 +1,103 @@ +# Sticky Pool Failover (no "main" account) + +## Problem + +Live on knuth: `pool openai_pool failed over openai_oauth -> openai_oauth_2 (429)` repeats +every few minutes (20+ messages). Hermes keeps working. Root cause is a flap: + +- `openai_oauth` is position 0 (the de-facto "main"). Its OpenAI quota is exhausted, so it + 429s on every request. +- On 429 sluice fails over to `openai_oauth_2` and parks `openai_oauth` for only + `vault.RateLimitCooldown` = 60s. +- `PoolResolver.ResolveActive` picks the **first member in position order** that is healthy + or whose cooldown expired. 60s later `openai_oauth`'s cooldown lapses, so it is re-selected + even though `openai_oauth_2` was serving fine. +- `openai_oauth` is still quota-exhausted upstream -> 429 -> failover again -> another + identical Telegram notice. Repeats on every request gap > 60s. + +The real OpenAI Codex quota window is hours/weekly, so re-probing the exhausted account +every 60s and snapping back to it is wrong. + +## Desired behavior (user decision) + +Sticky failover: there is no "main" account. Stay on whichever member is currently active +until **that** member exhausts, then advance to the next member. A lower-position member +recovering from cooldown must NOT cause a switch back to it. A new failover (and its single +Telegram notice) happens only on a genuine exhaustion transition, not on cooldown lapse. + +Mode toggle (position-priority vs sticky as selectable strategies) is explicitly out of +scope for this change; sticky becomes the behavior. Note it as a possible follow-up. + +## Design + +`ResolveActive` is the single source of truth: `internal/proxy/pool_failover.go` calls +`pr.ResolveActive(pool)` to compute the new active ("to") after cooling the old member, so +making selection sticky fixes the flap, the failover events, and the notification spam in +one place. + +1. Add a per-pool current-active pointer to the swap-surviving shared `PoolHealth` + (`internal/vault/pool.go`), guarded by the same mutex as the cooldown map, so it + survives `NewPoolResolverShared` regeneration and atomic swaps (same lifecycle as + cooldowns; carry it the way cooldowns are carried). +2. Sticky `ResolveActive(pool)`: + - Let `cur` = shared current-active for the pool, if set and still a member of this + generation. + - If `cur` is set and healthy (no active cooldown) -> return `cur` (sticky hold). + - Else pick the next eligible member by position **starting after `cur`'s position and + wrapping** (so we advance forward, never snap back to position 0); skip members in + active cooldown; treat `ManualRotateReason` parks with the existing + better-degrade-target semantics. Set shared current-active to the picked member and + return it. + - If every member is cooling: keep the existing degrade behavior (operator-parked-but- + healthy first, else soonest-recovering) and do NOT move the sticky pointer, so when a + member recovers the next call advances to a healthy one rather than the absolute + position-0 member. +3. Operator `sluice pool rotate` semantics unchanged at the surface: it still parks the + current active so the next `ResolveActive` advances; with sticky that advance lands on + the next member and stays there (no snap-back), which is the intended rotate behavior. +4. Keep all existing concurrency invariants: sticky pointer reads/writes under the same + `PoolHealth.mu`; never lost across atomic pointer swap; a stale resolver generation must + not clobber it (mirror the cooldown CRITICAL-1 handling). + +## Out of scope + +- Strategy/mode toggle across CLI/REST/Telegram + schema (`credential_pools.strategy`) — + follow-up only; note the synergy, do not build. +- Honoring upstream `Retry-After` / changing `RateLimitCooldown` constant — sticky makes + the short cooldown harmless (we no longer re-probe the cooled member until forced), so a + cooldown-duration change is unnecessary for this fix. + +## Testing strategy + +- Unit (vault): sticky hold — active member stays selected across many `ResolveActive` + calls while a lower-position member is healthy. +- Unit (vault): flap regression — fail member A (cooldown), `ResolveActive` returns B; A's + cooldown expires; `ResolveActive` still returns B (no snap-back). B then cools -> + advance to next, wrapping. +- Unit (vault): all-cooling degrade unchanged; operator-parked degrade-target preserved. +- Unit (vault): sticky pointer survives `NewPoolResolverShared` regeneration + swap, and a + stale generation cannot clobber it (extend the existing CRITICAL-1 style test). +- Unit (proxy): the failover path emits exactly ONE `cred_failover` + one notice per real + exhaustion transition, and emits NOTHING when a non-active member's cooldown merely + lapses (the spam regression, fail-before/pass-after). +- Full `go test ./...`, `-race` on `internal/vault` and `internal/proxy`, gofumpt, + golangci-lint, `go vet ./...` and `-tags=e2e ./e2e/`. + +## Steps + +### Task 1: Sticky selection in vault.PoolResolver +- [ ] Add swap-surviving per-pool current-active to shared `PoolHealth` (same mutex) +- [ ] Rewrite `ResolveActive` to the sticky algorithm above; preserve degrade + parked semantics +- [ ] Preserve CRITICAL-1 invariants (no loss/clobber across swap; stale generation safe) +- [ ] Unit tests: sticky hold, flap regression (no snap-back), advance+wrap, degrade unchanged, swap-survival +- [ ] `go test ./internal/vault/ -race`, gofumpt, vet + +### Task 2: Failover path + notification spam regression +- [ ] Confirm `pool_failover.go` from->to now changes only on real exhaustion (sticky source of truth); adjust only if it bypasses `ResolveActive` +- [ ] Test: one `cred_failover`+notice per real transition; zero events when a non-active member's cooldown lapses (fail-before/pass-after) +- [ ] `go test ./internal/proxy/ -race`, gofumpt, vet + +### Task 3: Docs + final validation +- [ ] Update CLAUDE.md credential-pools section to describe sticky selection (replace the position-priority wording) and note the mode-toggle follow-up +- [ ] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues; full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/` +- [ ] Independently verify committed HEAD builds and tests pass (do not trust subagent green) From 8f4399485cf27172217a0d8fe05a796c21eb8160 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 20:55:04 +0800 Subject: [PATCH 2/8] fix(vault): sticky pool failover selection (no main account) --- internal/vault/pool.go | 128 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 15 deletions(-) diff --git a/internal/vault/pool.go b/internal/vault/pool.go index 2ecb084..7a3f5e9 100644 --- a/internal/vault/pool.go +++ b/internal/vault/pool.go @@ -96,13 +96,30 @@ type PoolHealth struct { // same-named successor) so the stale write must NOT apply. Mutated only // under mu, the same lock the cooldown map uses. currentMembers map[string]memberIdentity + // active maps a pool name -> the credential ResolveActive last settled + // on (the sticky pointer). It lives here, NOT on PoolResolver, for the + // exact same CRITICAL-1 reason cooldowns do: a membership change / SIGHUP + // / 2s data_version watcher rebuilds a fresh PoolResolver that the server + // atomically pointer-swaps, and ResolveActive on the new generation must + // keep serving the member it switched to instead of snapping back to + // position 0 (the flap that spams cred_failover + Telegram). Reads and + // writes are under the SAME mu as the cooldown map (no second lock, no + // lock-ordering hazard). It survives swaps because every generation + // shares one *PoolHealth; a stale generation cannot clobber it because + // ResolveActive only ever writes a member of THIS generation's member + // list, and the MergeLiveCooldowns / SetCurrentMembers prune drops the + // pointer for any pool no longer present (mirrors the cooldown prune). + active map[string]string } // NewPoolHealth returns an empty shared health map. Call this exactly once // per process and thread the result through every NewPoolResolver so all // resolver generations share one cooldown view. func NewPoolHealth() *PoolHealth { - return &PoolHealth{health: make(map[string]memberHealth)} + return &PoolHealth{ + health: make(map[string]memberHealth), + active: make(map[string]string), + } } // SetCurrentMembers atomically replaces the authoritative member set for the @@ -287,11 +304,32 @@ func (pr *PoolResolver) Members(pool string) []string { // ResolveActive expands a name to the credential that should actually be // used. For a plain credential (not a pool) the name is returned unchanged. -// For a pool, the first member that is healthy or whose cooldown has expired -// (in position order) is returned. If every member is still cooling down, -// the member with the soonest recovery is returned and a WARNING is logged -// (degraded: sluice keeps serving with the least-bad account rather than -// failing the request outright). +// +// For a pool the selection is STICKY (there is no "main" / position-0 +// account): +// +// - If a current-active member is recorded for this pool, is still a +// member of THIS generation, and is healthy (no active cooldown), it is +// returned unchanged — a sticky hold. A lower-position member recovering +// from cooldown does NOT cause a switch back to it (this is the flap fix: +// re-probing an exhausted-upstream member every 60s and snapping back to +// it was wrong and spammed cred_failover + Telegram). +// - Otherwise the next eligible member is chosen by position order +// STARTING AFTER the current-active member's position and WRAPPING +// (advance forward, never snap back to position 0). Members in active +// cooldown are skipped. The chosen member becomes the new sticky +// current-active for the pool. +// - If EVERY member is cooling, sluice keeps serving with the least-bad +// account (degraded): an operator-parked-but-healthy member +// (ManualRotateReason) is preferred over a genuinely failed one, else +// the soonest-recovering member; a WARNING is logged. The sticky pointer +// is NOT moved in this case, so when a member recovers the next call +// advances forward to it rather than snapping to absolute position 0. +// +// The current-active pointer lives on the shared *PoolHealth and is read/ +// written under the same mu as the cooldown map, so it survives resolver +// pointer swaps (CRITICAL-1) and a stale generation cannot clobber it: a +// write only ever stores a member of THIS generation's member list. func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { if pr == nil { return name, true @@ -306,20 +344,58 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { return "", false } - // Read the shared health map under its RLock. A concurrent failover's - // MarkCooldown takes the same map's write lock, so this observes a - // consistent cooldown view regardless of resolver generation. - pr.health.mu.RLock() - defer pr.health.mu.RUnlock() + // Take the shared map's WRITE lock: the sticky pointer is mutated here + // when the active member is unhealthy/unset, and it must be consistent + // with the cooldown view a concurrent failover's MarkCooldown writes + // (same mu, no second lock, no lock-ordering hazard). + pr.health.mu.Lock() + defer pr.health.mu.Unlock() now := time.Now() - var soonest, soonestParked string - var soonestUntil, soonestParkedUntil time.Time - for _, m := range members { + + cooling := func(m string) bool { h, tracked := pr.health.health[m] - if !tracked || h.cooldownUntil.IsZero() || !h.cooldownUntil.After(now) { + return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(now) + } + + // Position of the current sticky member in THIS generation's member + // list. startIdx == -1 (no valid current) makes the scan start at + // position 0; otherwise it starts AFTER cur and wraps. + startIdx := -1 + if cur, set := pr.health.active[name]; set { + for i, m := range members { + if m != cur { + continue + } + // Sticky hold: the recorded active member is still a member of + // this generation and is healthy — keep serving it. Do not move. + if !cooling(cur) { + return cur, true + } + startIdx = i + break + } + } + + // Advance forward from AFTER the current member (or position 0 when + // there is no valid current), wrapping, picking the first non-cooling + // member. Never snap back to position 0 on a flap. + n := len(members) + for off := 1; off <= n; off++ { + idx := (startIdx + off) % n + m := members[idx] + if !cooling(m) { + pr.health.active[name] = m return m, true } + } + + // Every member is cooling: degrade (do NOT move the sticky pointer, so a + // recovery advances forward rather than snapping to position 0). + var soonest, soonestParked string + var soonestUntil, soonestParkedUntil time.Time + for _, m := range members { + h := pr.health.health[m] if soonest == "" || h.cooldownUntil.Before(soonestUntil) { soonest = m soonestUntil = h.cooldownUntil @@ -515,6 +591,28 @@ func (pr *PoolResolver) MergeLiveCooldowns(prev *PoolResolver) { cm[cred] = id } pr.health.currentMembers = cm + // Sticky-pointer prune (mirrors the cooldown prune above): drop the + // recorded active member for any pool this generation no longer has, + // or whose recorded member is no longer in that pool. A surviving + // pool keeps its sticky member so a benign reload does NOT snap it + // back to position 0 (the whole point of CRITICAL-1 for the pointer). + for poolName, cur := range pr.health.active { + poolMembers, stillPool := pr.pools[poolName] + if !stillPool { + delete(pr.health.active, poolName) + continue + } + stillMember := false + for _, m := range poolMembers { + if m == cur { + stillMember = true + break + } + } + if !stillMember { + delete(pr.health.active, poolName) + } + } pr.health.mu.Unlock() return } From e1b7ad1c46cc6899accbb590f240be4843e5de11 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 20:55:11 +0800 Subject: [PATCH 3/8] test(pool): sticky-hold, flap, advance-wrap, swap-survival and spam regressions --- internal/proxy/pool_failover_test.go | 107 ++++++++++++++++++++- internal/vault/pool_test.go | 139 ++++++++++++++++++++++++++- 2 files changed, 240 insertions(+), 6 deletions(-) diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go index b039a61..652c984 100644 --- a/internal/proxy/pool_failover_test.go +++ b/internal/proxy/pool_failover_test.go @@ -193,11 +193,23 @@ func TestFailoverCooldownTTLAndLazyRecovery(t *testing.T) { t.Fatalf("auth-fail cooldown TTL = %v, want ~%v", gotTTL, vault.AuthFailCooldown) } - // Lazy recovery: force the cooldown to the past; ResolveActive must - // treat memA as eligible again with no background scheduler involved. + // Lazy recovery: force the cooldown to the past. memA becomes ELIGIBLE + // again with no background scheduler involved (CooldownUntil reports it + // is no longer cooling). Selection is STICKY, so memB - which we failed + // over to and which is healthy - remains active: a recovered lower- + // position member must NOT snap back (the flap fix). memA's eligibility + // only matters as a future advance target if memB later exhausts. pr.MarkCooldown("memA", time.Now().Add(-time.Second), "expired") + if _, cooling := pr.CooldownUntil("memA"); cooling { + t.Fatal("after expiry memA must no longer be cooling (lazy recovery, no scheduler)") + } + if active, _ := pr.ResolveActive("codex_pool"); active != "memB" { + t.Fatalf("after expiry active = %q, want memB (sticky: recovered memA must NOT snap back)", active) + } + // Confirm memA IS the advance target once memB exhausts (wrap forward). + pr.MarkCooldown("memB", time.Now().Add(vault.RateLimitCooldown), "429") if active, _ := pr.ResolveActive("codex_pool"); active != "memA" { - t.Fatalf("after expiry active = %q, want memA (lazy recovery)", active) + t.Fatalf("after memB exhausts active = %q, want memA (advance forward to recovered member)", active) } } @@ -346,6 +358,95 @@ func TestFailoverAuditEvent(t *testing.T) { } } +// TestStickyFailoverNoSpamOnNonActiveCooldownLapse is the notification-spam +// regression for the live knuth flap. memA (position 0) is upstream- +// exhausted. Sequence: +// +// 1. memA 429 -> exactly ONE cred_failover (memA->memB) + one notice. +// 2. memA's short cooldown lapses. Under the OLD position-priority +// ResolveActive, the next request's injection would re-select memA +// (lower position, no longer cooling), hit the still-exhausted upstream, +// 429 again, and emit ANOTHER identical failover + Telegram notice - +// repeating every cooldown window (the 20+ message spam). +// 3. With sticky selection memB stays active after memA recovers, so the +// subsequent request resolves to memB, succeeds (200), and produces NO +// additional failover event or notice. +// +// Fail-before: step 3 would record a 2nd cred_failover (the flap). +// Pass-after: exactly one cred_failover total, one notice total. +func TestStickyFailoverNoSpamOnNonActiveCooldownLapse(t *testing.T) { + dir := t.TempDir() + logPath := filepath.Join(dir, "audit.log") + logger, err := audit.NewFileLogger(logPath) + if err != nil { + t.Fatalf("NewFileLogger: %v", err) + } + t.Cleanup(func() { _ = logger.Close() }) + + addon, _, prPtr := setupPoolAddon(t, "memA", "memB") + addon.auditLog = logger + client := setupAddonConn(addon, "auth.example.com:443") + pr := prPtr.Load() + + var notices int + addon.SetOnFailover(func(FailoverEvent) { notices++ }) + + if got, _ := pr.ResolveActive("codex_pool"); got != "memA" { + t.Fatalf("pre = %q, want memA", got) + } + + // 1. memA exhausts -> one real failover to memB. + f1 := newPoolRespFlow(client, 429, []byte(`{"error":"rate_limited"}`)) + addon.flowInjected.Tag(f1.Id, "memA") + addon.Response(f1) + if got, _ := pr.ResolveActive("codex_pool"); got != "memB" { + t.Fatalf("after 429 active = %q, want memB", got) + } + + // 2. memA's short cooldown lapses (still upstream-exhausted in reality). + pr.MarkCooldown("memA", time.Now().Add(-time.Second), "expired") + + // The next request's injection resolves the pool. STICKY: it must stay + // on memB, NOT snap back to the recovered-but-still-exhausted memA. This + // is the single source of truth that kills the flap. + if got, _ := pr.ResolveActive("codex_pool"); got != "memB" { + t.Fatalf("FLAP: after memA cooldown lapse, injection resolves %q, want memB", got) + } + + // 3. The subsequent legitimate request therefore hits memB and succeeds; + // a 200 is a documented no-op (no failover, no notice). + f2 := newPoolRespFlow(client, 200, []byte(`{"ok":true}`)) + addon.flowInjected.Tag(f2.Id, "memB") + addon.Response(f2) + + if err := logger.Close(); err != nil { + t.Fatalf("logger close: %v", err) + } + data, err := os.ReadFile(logPath) + if err != nil { + t.Fatalf("read audit log: %v", err) + } + failovers := 0 + for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { + if line == "" { + continue + } + var evt audit.Event + if uerr := json.Unmarshal([]byte(line), &evt); uerr != nil { + t.Fatalf("unmarshal %q: %v", line, uerr) + } + if evt.Action == "cred_failover" || evt.Action == "pool_exhausted" { + failovers++ + } + } + if failovers != 1 { + t.Fatalf("got %d failover audit events, want exactly 1 (sticky must not flap on non-active cooldown lapse)\n%s", failovers, data) + } + if notices != 1 { + t.Fatalf("got %d failover notices, want exactly 1 (no Telegram spam on cooldown lapse)", notices) + } +} + // TestPoolForResponseResolvesActiveMember sanity-checks the destination -> // pool reverse mapping used by handlePoolFailover. func TestPoolForResponseResolvesActiveMember(t *testing.T) { diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go index fd61a2f..99acfa3 100644 --- a/internal/vault/pool_test.go +++ b/internal/vault/pool_test.go @@ -98,6 +98,137 @@ func TestMarkCooldownLegacyUnscopedStillGated(t *testing.T) { } } +// TestResolveActiveStickyHold: once a member is selected it keeps being +// returned across many ResolveActive calls while it is healthy, even though a +// lower-position member is also healthy. (Fail-before: old position-priority +// always returned position-0 "a".) +func TestResolveActiveStickyHold(t *testing.T) { + pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil) + // First resolution settles on "a" (position 0, both healthy). + if got, _ := pr.ResolveActive("pool"); got != "a" { + t.Fatalf("initial = %q, want a", got) + } + // "a" cools -> switch to "b". + pr.MarkCooldown("a", time.Now().Add(60*time.Second), "429") + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Fatalf("after cooling a = %q, want b", got) + } + // "a" recovers, but "b" is healthy: sticky hold across many calls. + pr.MarkCooldown("a", time.Time{}, "") + for i := 0; i < 25; i++ { + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Fatalf("call %d: sticky hold broke, got %q want b (lower-position a recovered must NOT snap back)", i, got) + } + } +} + +// TestResolveActiveFlapRegression is the core flap fix. Sequence mirrors the +// live knuth bug: A (position 0) is upstream-exhausted. Fail A (cooldown) -> +// ResolveActive returns B. A's cooldown EXPIRES -> ResolveActive STILL returns +// B (no snap-back, so A is not re-probed every 60s and no spurious failover). +// Then B itself cools -> advance to the next member WITH WRAP (back to A, +// which has recovered). Fail-before (position-priority): step 3 would return +// A, the flap. +func TestResolveActiveFlapRegression(t *testing.T) { + pr := NewPoolResolver([]store.Pool{mkPool("pool", "A", "B", "C")}, nil) + if got, _ := pr.ResolveActive("pool"); got != "A" { + t.Fatalf("initial = %q, want A", got) + } + // 1. A exhausts -> failover to B. + pr.MarkCooldown("A", time.Now().Add(60*time.Second), "429") + if got, _ := pr.ResolveActive("pool"); got != "B" { + t.Fatalf("after cooling A = %q, want B", got) + } + // 2. A's short cooldown lapses (still upstream-exhausted in reality). + pr.MarkCooldown("A", time.Time{}, "") + if got, _ := pr.ResolveActive("pool"); got != "B" { + t.Fatalf("FLAP: after A cooldown lapse = %q, want B (must NOT snap back to A)", got) + } + // 3. B itself now exhausts -> advance forward with wrap. C is next. + pr.MarkCooldown("B", time.Now().Add(60*time.Second), "429") + if got, _ := pr.ResolveActive("pool"); got != "C" { + t.Fatalf("after cooling B = %q, want C (advance forward from B)", got) + } + // 4. C exhausts too -> wrap forward past end -> A (recovered at step 2). + pr.MarkCooldown("C", time.Now().Add(60*time.Second), "429") + if got, _ := pr.ResolveActive("pool"); got != "A" { + t.Fatalf("after cooling C = %q, want A (wrap forward, A is healthy again)", got) + } +} + +// TestResolveActiveStickyRotateAdvancesAndStays: `sluice pool rotate` parks +// the active member with ManualRotateReason; the next ResolveActive must +// advance to the next member and STAY there (no snap-back) even after the +// parked member's park lapses. +func TestResolveActiveStickyRotateAdvancesAndStays(t *testing.T) { + pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil) + if got, _ := pr.ResolveActive("pool"); got != "a" { + t.Fatalf("initial = %q, want a", got) + } + // Operator rotate: park the active "a". + pr.MarkCooldown("a", time.Now().Add(ManualRotateCooldownForTest()), ManualRotateReason) + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Fatalf("after rotate = %q, want b (advance)", got) + } + // "a"'s park lapses: must NOT snap back, "b" stays active. + pr.MarkCooldown("a", time.Time{}, "") + for i := 0; i < 10; i++ { + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Fatalf("call %d after park lapse = %q, want b (rotate advances AND stays)", i, got) + } + } +} + +// ManualRotateCooldownForTest is a small helper duration for the rotate test. +func ManualRotateCooldownForTest() time.Duration { return 300 * time.Second } + +// TestResolveActiveStickyPointerSurvivesRebuildAndSwap extends the CRITICAL-1 +// shared-health regression to the sticky pointer: a member switched-to on an +// OLD generation must remain the active member on a NEW generation built +// against the SAME shared PoolHealth, and a stale generation must not clobber +// it back to position 0. +func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) { + shared := NewPoolHealth() + gen1 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared) + if got, _ := gen1.ResolveActive("pool"); got != "a" { + t.Fatalf("gen1 initial = %q, want a", got) + } + // Failover on gen1 switches the sticky pointer to "b". + gen1.MarkCooldown("a", time.Now().Add(120*time.Second), "429") + if got, _ := gen1.ResolveActive("pool"); got != "b" { + t.Fatalf("gen1 after cooling a = %q, want b", got) + } + // "a" recovers (durable write may not have landed). + gen1.MarkCooldown("a", time.Time{}, "") + + // Reload: fresh generation, SAME shared health, store has no rows. + gen2 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared) + gen2.MergeLiveCooldowns(gen1) + // Sticky pointer survived the swap: gen2 keeps serving "b", not "a". + if got, _ := gen2.ResolveActive("pool"); got != "b" { + t.Fatalf("gen2 active = %q, want b (sticky pointer must survive resolver swap, no snap-back)", got) + } + + // A stale OLD generation's ResolveActive must not clobber the pointer to + // a member of the wrong/old member list. gen1 still has {a,b}; calling + // it again only ever writes a member of THIS gen's list. Even so, the + // authoritative current generation (gen2) must keep "b". + gen1.ResolveActive("pool") + if got, _ := gen2.ResolveActive("pool"); got != "b" { + t.Fatalf("after stale gen1 ResolveActive, gen2 = %q, want b (stale generation must not clobber sticky pointer)", got) + } + + // A pool dropped entirely prunes its sticky pointer (mirrors cooldown + // prune) so a re-add does not inherit a stale active member. + gen3 := NewPoolResolverShared([]store.Pool{mkPool("other", "x")}, nil, shared) + gen3.MergeLiveCooldowns(gen2) + gen4 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b"), mkPool("other", "x")}, nil, shared) + gen4.MergeLiveCooldowns(gen3) + if got, _ := gen4.ResolveActive("pool"); got != "a" { + t.Fatalf("re-added pool active = %q, want a (dropped pool's sticky pointer must be pruned)", got) + } +} + func TestResolveActivePassthroughForNonPool(t *testing.T) { pr := NewPoolResolver(nil, nil) got, ok := pr.ResolveActive("plain_cred") @@ -219,10 +350,12 @@ func TestMarkCooldownSynchronousFlip(t *testing.T) { if _, cooling := pr.CooldownUntil("a"); !cooling { t.Error("CooldownUntil(a) cooling=false, want true") } - // Clearing (zero/past) recovers the member. + // Clearing (zero/past) recovers the member, but selection is STICKY: + // "a" recovering does NOT snap the active member back to it. "b" was + // switched to and is healthy, so it keeps being served (flap fix). pr.MarkCooldown("a", time.Time{}, "") - if got, _ := pr.ResolveActive("pool"); got != "a" { - t.Errorf("after clear active = %q, want a", got) + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Errorf("after clear active = %q, want b (sticky: recovered a must NOT snap back)", got) } } From 22243f400a8676c74367763d1416f1009181dd13 Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 20:55:18 +0800 Subject: [PATCH 4/8] docs(pools): describe sticky active-member selection --- CLAUDE.md | 2 +- docs/plans/20260518-sticky-failover.md | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 181c737..eec0716 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -191,7 +191,7 @@ Auto-failover on 429/401 is primary; `pool rotate` is an override. - **Single chokepoint (I2):** every `binding.Credential` / `OAuthIndex.Has` / `extractInjectableSecret` / persist consumer on the HTTP/HTTPS OAuth path routes through `PoolResolver.ResolveActive` (`resolveInjectionTarget` for pass-1 header + pass-2 swap; `resolveOAuthResponseAttribution` for response/persist). `idx.Has` is always called with the resolved member, never the pool. Plain credentials pass through unchanged; SSH/mail are non-OAuth, out of scope. - **QUIC scope:** the HTTP/1.x/HTTP/2 MITM addon implements the full feature set (R1, R3, Phase 2). The HTTP/3/QUIC path (`QUICProxy.buildPhantomPairs`, binding-header injection in `quic.go`) is a request-side buffered swap with **no response-side OAuth interception**, but IS pool-aware on the request side: `QUICProxy.resolvePoolTarget` (via `NewQUICProxy`'s `poolResolver`) selects the active member's real secret and routes through `buildPooledOAuthPhantomPairs` so the access phantom is the same pool-stable JWT (R3 holds over QUIC). QUIC does **not** do R1 attribution or Phase 2 failover — the injected member is whatever the HTTP path / `pool rotate` last made active, and a QUIC-only 429/401 or refresh is not acted on. Deployments needing R1/auto-failover must route the pooled upstream over HTTP/HTTPS. -- **Active-member selection:** healthy or expired-cooldown members first, by configured position; if all are in cooldown, the soonest-recovering is returned with a WARNING (degrade, never hard-fail). Recovery is lazy (evaluated in `ResolveActive`, no scheduler). +- **Active-member selection (sticky, no "main" account):** `ResolveActive` is sticky. There is no position-0 "main": once selection settles on a member sluice keeps returning it while it is healthy, even if a lower-position member recovers from cooldown. A new active member is chosen only when the current one cools. The chosen member is the next eligible one by position **starting after the current member and wrapping** (advance forward, never snap back to position 0). The sticky current-active pointer lives on the shared swap-surviving `PoolHealth` under the same mutex as the cooldown map, so it survives `NewPoolResolverShared` regeneration and atomic pointer swaps (CRITICAL-1) and a stale resolver generation cannot clobber it, exactly like cooldowns. This kills the live flap where a 60s `RateLimitCooldown` lapse re-selected an upstream-exhausted account and respammed `cred_failover` plus a Telegram notice every cooldown window. If every member is cooling, the existing degrade applies (operator-parked-but-healthy `ManualRotateReason` first, else soonest-recovering, with a WARNING, never hard-fail) and the sticky pointer is left untouched so a recovery advances forward rather than snapping to position 0. `sluice pool rotate` still works: it parks the active member, so the next `ResolveActive` advances to the next member and then stays there (no snap-back). Recovery is lazy (evaluated in `ResolveActive`, no scheduler). A selectable position-priority-vs-sticky strategy mode is a possible follow-up and is out of scope here. - **R1 refresh-token attribution / fail-closed:** when pass-2 swaps `SLUICE_PHANTOM:.refresh`, sluice records `realRefreshToken -> member` in a short-TTL map; on the token-endpoint response it recovers the member by that real refresh token and persists to it (`persistAddonOAuthTokens(member, ...)`, singleflight `"persist:"+member`). The join key is the real **refresh** token — never the access token, connection, or `OAuthIndex.Match` (two pooled members share `auth.openai.com`'s token URL and collide). Unrecoverable -> WARNING + skip the write (rotating refresh tokens are single-use; a mis-attributed write bricks both accounts). **Plain-credential disambiguation:** a plain OAuth credential sharing a pool's token URL also tags its injected refresh token `realRefreshToken -> ` (plain path in `buildPhantomPairs`/`buildOAuthPhantomPairs`'s `onRefreshInject`, incl. split-host expansion); on response a recovered non-member (`PoolForMember == ""`) is attributed 1:1 to that plain credential, NOT fail-closed as pooled. The pooled fail-closed path applies only when recovery fails or resolves to an actual member; `poolForResponse` gates the same on an independent `flowInjected` tag (set post-swap only if a pool phantom was present) before cooling a member. - **R3 pool-stable phantom JWT:** Codex access tokens are JWTs; the per-real-token `resignJWT` would emit a different phantom after each cross-member refresh, breaking "agent never notices". `poolStablePhantomAccess` (`internal/proxy/oauth_response.go`) builds the phantom JWT from a deterministic synthetic payload keyed on the **pool name** (`sub: sluice-pool:`, `iss: sluice-phantom`, fixed far-future `exp`, no `iat`), HMAC-SHA256 with the fixed key — byte-identical across switches, structurally valid. Pool name is JSON-marshaled (never concatenated) so quotes/control chars can't inject claims. Static-form fallback (`SLUICE_PHANTOM:.access`) only on the unreachable `json.Marshal` failure. Refresh phantom stays static `SLUICE_PHANTOM:.refresh`. diff --git a/docs/plans/20260518-sticky-failover.md b/docs/plans/20260518-sticky-failover.md index 50e119f..52b7456 100644 --- a/docs/plans/20260518-sticky-failover.md +++ b/docs/plans/20260518-sticky-failover.md @@ -86,18 +86,18 @@ one place. ## Steps ### Task 1: Sticky selection in vault.PoolResolver -- [ ] Add swap-surviving per-pool current-active to shared `PoolHealth` (same mutex) -- [ ] Rewrite `ResolveActive` to the sticky algorithm above; preserve degrade + parked semantics -- [ ] Preserve CRITICAL-1 invariants (no loss/clobber across swap; stale generation safe) -- [ ] Unit tests: sticky hold, flap regression (no snap-back), advance+wrap, degrade unchanged, swap-survival -- [ ] `go test ./internal/vault/ -race`, gofumpt, vet +- [x] Add swap-surviving per-pool current-active to shared `PoolHealth` (same mutex) +- [x] Rewrite `ResolveActive` to the sticky algorithm above; preserve degrade + parked semantics +- [x] Preserve CRITICAL-1 invariants (no loss/clobber across swap; stale generation safe) +- [x] Unit tests: sticky hold, flap regression (no snap-back), advance+wrap, degrade unchanged, swap-survival +- [x] `go test ./internal/vault/ -race`, gofumpt, vet ### Task 2: Failover path + notification spam regression -- [ ] Confirm `pool_failover.go` from->to now changes only on real exhaustion (sticky source of truth); adjust only if it bypasses `ResolveActive` -- [ ] Test: one `cred_failover`+notice per real transition; zero events when a non-active member's cooldown lapses (fail-before/pass-after) -- [ ] `go test ./internal/proxy/ -race`, gofumpt, vet +- [x] Confirm `pool_failover.go` from->to now changes only on real exhaustion (sticky source of truth); adjust only if it bypasses `ResolveActive` +- [x] Test: one `cred_failover`+notice per real transition; zero events when a non-active member's cooldown lapses (fail-before/pass-after) +- [x] `go test ./internal/proxy/ -race`, gofumpt, vet ### Task 3: Docs + final validation -- [ ] Update CLAUDE.md credential-pools section to describe sticky selection (replace the position-priority wording) and note the mode-toggle follow-up -- [ ] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues; full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/` -- [ ] Independently verify committed HEAD builds and tests pass (do not trust subagent green) +- [x] Update CLAUDE.md credential-pools section to describe sticky selection (replace the position-priority wording) and note the mode-toggle follow-up +- [x] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues; full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/` +- [x] Independently verify committed HEAD builds and tests pass (do not trust subagent green) From eb07a8ab3b4bd6dc703e1bf9bec5a578d3f9bb5a Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 21:09:45 +0800 Subject: [PATCH 5/8] perf(vault): read-mostly fast path and epoch-scoped sticky pointer in ResolveActive --- internal/vault/pool.go | 157 ++++++++++++++++++++++++++++++----------- 1 file changed, 115 insertions(+), 42 deletions(-) diff --git a/internal/vault/pool.go b/internal/vault/pool.go index 7a3f5e9..fbce069 100644 --- a/internal/vault/pool.go +++ b/internal/vault/pool.go @@ -49,6 +49,20 @@ type memberIdentity struct { epoch int64 } +// activeEntry is the sticky pointer's value: the member ResolveActive last +// settled on PLUS the (pool, epoch) identity of the generation that wrote it. +// Finding 3: storing only the member NAME let a remove/re-create of the same +// pool name with an overlapping member name be accepted by a later +// generation as a valid sticky hold even though that name belongs to the OLD +// epoch/order. Carrying the epoch (the same monotonic membership epoch the +// cooldown gate keys on via memberIdentity) lets ResolveActive reject a +// stored sticky entry whose epoch no longer matches this generation's +// identity for the pool and advance fresh instead. +type activeEntry struct { + member string + epoch int64 +} + // PoolHealth is the mutex-guarded credential cooldown map. It is // deliberately a SEPARATE object from PoolResolver so it can outlive any // single resolver generation. @@ -107,9 +121,18 @@ type PoolHealth struct { // lock-ordering hazard). It survives swaps because every generation // shares one *PoolHealth; a stale generation cannot clobber it because // ResolveActive only ever writes a member of THIS generation's member - // list, and the MergeLiveCooldowns / SetCurrentMembers prune drops the - // pointer for any pool no longer present (mirrors the cooldown prune). - active map[string]string + // list, AND the entry is epoch-scoped (activeEntry.epoch): a write records + // the writing generation's membership epoch, and ResolveActive ignores a + // stored entry whose epoch no longer matches THIS generation's identity + // for the pool (Finding 3 — a same-pool-name re-create with overlapping + // member names bumps the epoch, so the stale sticky hold is rejected and + // the pointer advances fresh). Both live reload paths that swap the member + // set — NewPoolResolverShared -> SetCurrentMembers AND + // MergeLiveCooldowns's shared-map branch — prune this map for any pool no + // longer present, whose recorded member is no longer in that pool, or + // whose recorded epoch no longer matches the new generation (mirrors the + // cooldown prune). + active map[string]activeEntry } // NewPoolHealth returns an empty shared health map. Call this exactly once @@ -118,7 +141,7 @@ type PoolHealth struct { func NewPoolHealth() *PoolHealth { return &PoolHealth{ health: make(map[string]memberHealth), - active: make(map[string]string), + active: make(map[string]activeEntry), } } @@ -131,13 +154,39 @@ func NewPoolHealth() *PoolHealth { // observes the OLD member set entirely or the NEW one entirely — it can never // observe a half-updated set, and it can never slip a non-member cooldown in // between the prune and the member-set swap. +// +// It ALSO prunes the sticky-pointer map under the same lock so this live +// path stays consistent with MergeLiveCooldowns's shared-map prune (Finding +// 1): on the server reload path NewPoolResolverShared calls this BEFORE +// StorePool reaches MergeLiveCooldowns, so without pruning here a dropped or +// epoch-bumped pool would briefly keep a stale sticky entry that +// ResolveActive could observe. A sticky entry is dropped when the pool's +// recorded member is no longer present in the new member set, or its epoch +// no longer matches the new generation (Finding 3); an entry for a pool with +// no surviving members is also dropped (every member of that pool would be +// absent from `members`). func (ph *PoolHealth) SetCurrentMembers(members map[string]memberIdentity) { if ph == nil { return } ph.mu.Lock() + defer ph.mu.Unlock() ph.currentMembers = members - ph.mu.Unlock() + ph.pruneActiveLocked(members) +} + +// pruneActiveLocked drops sticky-pointer entries that are no longer valid for +// the generation described by `members` (cred -> pool+epoch). An entry +// survives only when its recorded member still maps to the SAME pool with the +// SAME epoch. Caller must hold ph.mu. Shared by SetCurrentMembers and +// MergeLiveCooldowns so both live reload paths prune identically. +func (ph *PoolHealth) pruneActiveLocked(members map[string]memberIdentity) { + for poolName, ent := range ph.active { + id, stillMember := members[ent.member] + if !stillMember || id.pool != poolName || id.epoch != ent.epoch { + delete(ph.active, poolName) + } + } } // Seed merges store-persisted cooldown rows into the shared map. It is @@ -344,33 +393,68 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { return "", false } - // Take the shared map's WRITE lock: the sticky pointer is mutated here - // when the active member is unhealthy/unset, and it must be consistent - // with the cooldown view a concurrent failover's MarkCooldown writes - // (same mu, no second lock, no lock-ordering hazard). - pr.health.mu.Lock() - defer pr.health.mu.Unlock() - - now := time.Now() + // THIS generation's membership epoch for the pool. Every member of a pool + // shares one epoch (the store stamps all rows of a membership generation + // with the same monotonic value), so the first member's identity epoch is + // the pool's epoch. Used to reject a sticky entry written by an older + // generation after a same-pool-name re-create bumped the epoch (Finding + // 3). identity is immutable for this resolver, so it is read lock-free. + genEpoch := pr.identity[members[0]].epoch cooling := func(m string) bool { + // Caller holds ph.mu (R or W). h, tracked := pr.health.health[m] - return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(now) + return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(time.Now()) + } + + // Read-mostly fast path (Finding 2): the common case is a sticky hold — + // the recorded active member is still a member of THIS generation, its + // epoch matches, and it is not cooling. That requires no mutation, so it + // runs under the shared RLock and does not serialize with other resolves. + pr.health.mu.RLock() + if ent, set := pr.health.active[name]; set && ent.epoch == genEpoch { + for _, m := range members { + if m != ent.member { + continue + } + if !cooling(ent.member) { + pr.health.mu.RUnlock() + return ent.member, true + } + break + } } + pr.health.mu.RUnlock() + + // Slow path: the sticky pointer must be advanced/initialized (unset, + // stale-epoch, no longer a member, or cooling). Go's RWMutex has no + // in-place upgrade, so drop the RLock and take the WRITE lock, then + // RE-CHECK the sticky-hold condition: another goroutine may have advanced + // the pointer between RUnlock and Lock, in which case return it without + // re-advancing. The write lock keeps the sticky pointer consistent with + // the cooldown view a concurrent failover's MarkCooldown writes (same mu, + // no second lock, no lock-ordering hazard). + pr.health.mu.Lock() + defer pr.health.mu.Unlock() // Position of the current sticky member in THIS generation's member // list. startIdx == -1 (no valid current) makes the scan start at - // position 0; otherwise it starts AFTER cur and wraps. + // position 0; otherwise it starts AFTER cur and wraps. A stored entry + // whose epoch does not match this generation is treated as "no current + // active" (Finding 3): it belongs to an old epoch/order, so advance + // fresh rather than honor a cross-generation name collision. startIdx := -1 - if cur, set := pr.health.active[name]; set { + if ent, set := pr.health.active[name]; set && ent.epoch == genEpoch { for i, m := range members { - if m != cur { + if m != ent.member { continue } - // Sticky hold: the recorded active member is still a member of - // this generation and is healthy — keep serving it. Do not move. - if !cooling(cur) { - return cur, true + // Re-check under the write lock: still a member of this + // generation and healthy — another resolver may have just + // advanced here, or it was a benign RLock->Lock race. Keep + // serving it; do not move. + if !cooling(ent.member) { + return ent.member, true } startIdx = i break @@ -385,7 +469,7 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { idx := (startIdx + off) % n m := members[idx] if !cooling(m) { - pr.health.active[name] = m + pr.health.active[name] = activeEntry{member: m, epoch: genEpoch} return m, true } } @@ -593,26 +677,15 @@ func (pr *PoolResolver) MergeLiveCooldowns(prev *PoolResolver) { pr.health.currentMembers = cm // Sticky-pointer prune (mirrors the cooldown prune above): drop the // recorded active member for any pool this generation no longer has, - // or whose recorded member is no longer in that pool. A surviving - // pool keeps its sticky member so a benign reload does NOT snap it - // back to position 0 (the whole point of CRITICAL-1 for the pointer). - for poolName, cur := range pr.health.active { - poolMembers, stillPool := pr.pools[poolName] - if !stillPool { - delete(pr.health.active, poolName) - continue - } - stillMember := false - for _, m := range poolMembers { - if m == cur { - stillMember = true - break - } - } - if !stillMember { - delete(pr.health.active, poolName) - } - } + // whose recorded member is no longer in that pool, or whose recorded + // epoch no longer matches this generation (Finding 3 — a same-pool- + // name re-create with overlapping member names bumps the epoch). A + // surviving pool with a still-valid same-epoch member keeps its sticky + // member so a benign reload does NOT snap it back to position 0 (the + // whole point of CRITICAL-1 for the pointer). Same predicate as + // SetCurrentMembers via the shared helper so both live reload paths + // prune identically (Finding 1). + pr.health.pruneActiveLocked(cm) pr.health.mu.Unlock() return } From a34000e8b295fcbf360002624075dddb318790ce Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 21:09:51 +0800 Subject: [PATCH 6/8] test(vault): cover sticky fast/advance paths, epoch clobber, and SetCurrentMembers prune --- internal/vault/pool_test.go | 112 ++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go index 99acfa3..958bc59 100644 --- a/internal/vault/pool_test.go +++ b/internal/vault/pool_test.go @@ -769,3 +769,115 @@ func TestFinding3Round15_ConcurrentStaleMarkVsPruneUnderRace(t *testing.T) { } } } + +// TestResolveActiveStickyEpochClobberRegression is the Finding 3 regression. +// A pool name is removed and re-created with an OVERLAPPING member name but a +// strictly greater membership epoch (the store stamps every membership +// generation with a monotonic epoch). Before the fix the sticky pointer +// stored only the member NAME, so the new generation accepted the old +// generation's stored name as a valid sticky hold even though it belongs to +// the OLD epoch/order. Fail-before: gen2 honors the stale name and skips +// fresh position-0 selection. Pass-after: the epoch mismatch makes gen2 +// treat it as "no current active" and select by fresh position order. +func TestResolveActiveStickyEpochClobberRegression(t *testing.T) { + shared := NewPoolHealth() + const e1 = int64(1) + const e2 = int64(2) + + // gen1: pool P = [a, b] at epoch e1. Fail a over so the sticky pointer + // settles on "b". + gen1 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("P", e1, "a", "b")}, nil, shared) + if got, _ := gen1.ResolveActive("P"); got != "a" { + t.Fatalf("gen1 initial = %q, want a", got) + } + gen1.MarkCooldownScoped("a", "P", e1, time.Now().Add(300*time.Second), "failover:429") + if got, _ := gen1.ResolveActive("P"); got != "b" { + t.Fatalf("gen1 after cooling a = %q, want b", got) + } + + // P is removed and RE-CREATED with the SAME name but a DIFFERENT member + // order [b, a] at a strictly greater epoch e2. "b" still exists by name + // but at epoch e2 / position 0; the stored sticky entry (member "b", + // epoch e1) must NOT be honored — the new generation must pick by fresh + // position order, which is "b" at position 0. To make the test prove the + // epoch check (not just coincide), the new order is [c, b]: position 0 is + // "c". A name-only sticky pointer would wrongly return "b"; the + // epoch-scoped pointer rejects the stale entry and returns fresh + // position-0 "c". + gen2 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("P", e2, "c", "b")}, nil, shared) + gen2.MergeLiveCooldowns(gen1) + if got, ok := gen2.ResolveActive("P"); !ok || got != "c" { + t.Fatalf("Finding 3: stale-epoch sticky entry honored; got %q,%v want c,true "+ + "(epoch-bumped re-create must NOT inherit the old generation's sticky member)", got, ok) + } +} + +// TestResolveActiveFastPathAndAdvancePath exercises both Finding 2 lock +// paths for correctness: the RLock sticky-hold fast path (no mutation) and +// the write-lock advance path. It is a correctness test, not a benchmark. +func TestResolveActiveFastPathAndAdvancePath(t *testing.T) { + shared := NewPoolHealth() + pr := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared) + + // First call: no sticky entry -> advance/init path (write lock), settles a. + if got, _ := pr.ResolveActive("pool"); got != "a" { + t.Fatalf("init = %q, want a", got) + } + // Subsequent calls: sticky hold -> RLock fast path, must keep returning a. + for i := 0; i < 100; i++ { + if got, _ := pr.ResolveActive("pool"); got != "a" { + t.Fatalf("fast-path call %d = %q, want a (sticky hold)", i, got) + } + } + // Cool a -> next call takes the advance (write-lock) path and moves to b. + pr.MarkCooldown("a", time.Now().Add(120*time.Second), "429") + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Fatalf("after cooling a = %q, want b (advance path)", got) + } + // Now b is the sticky hold: fast path again. + for i := 0; i < 100; i++ { + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Fatalf("fast-path call %d after advance = %q, want b", i, got) + } + } + // Concurrent resolves under the read-mostly pattern must all agree and be + // race-clean (run under -race). + var wg sync.WaitGroup + for i := 0; i < 64; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := 0; j < 200; j++ { + if got, _ := pr.ResolveActive("pool"); got != "b" { + t.Errorf("concurrent resolve = %q, want b", got) + return + } + } + }() + } + wg.Wait() +} + +// TestSetCurrentMembersPrunesStaleActive pins Finding 1: SetCurrentMembers is +// a live reload path (NewPoolResolverShared calls it before StorePool reaches +// MergeLiveCooldowns), so it must itself prune a stale sticky entry. This +// drives SetCurrentMembers directly with a member set that drops the pool's +// recorded member and asserts the sticky pointer is gone. +func TestSetCurrentMembersPrunesStaleActive(t *testing.T) { + shared := NewPoolHealth() + pr := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b")}, nil, shared) + if got, _ := pr.ResolveActive("pool"); got != "a" { + t.Fatalf("init = %q, want a", got) + } + // New generation: pool no longer contains "a" (membership changed). Drive + // SetCurrentMembers directly with the new member set (b only, epoch 2). + shared.SetCurrentMembers(map[string]memberIdentity{ + "b": {pool: "pool", epoch: 2}, + }) + shared.mu.RLock() + _, stillActive := shared.active["pool"] + shared.mu.RUnlock() + if stillActive { + t.Fatal("Finding 1: SetCurrentMembers did not prune the stale sticky pointer for a dropped/epoch-bumped member") + } +} From ee609ab8248fcbcccaca5dcc8b2d5e08bc6eaeaa Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 21:21:51 +0800 Subject: [PATCH 7/8] perf(vault): single time.Now snapshot per ResolveActive and drop write lock before degrade logging --- internal/vault/pool.go | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/internal/vault/pool.go b/internal/vault/pool.go index fbce069..d818fc3 100644 --- a/internal/vault/pool.go +++ b/internal/vault/pool.go @@ -401,10 +401,20 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { // 3). identity is immutable for this resolver, so it is read lock-free. genEpoch := pr.identity[members[0]].epoch + // Finding 1: one time snapshot for the whole resolve. cooling() runs once + // per member in the scan loops; calling time.Now() per invocation made the + // cooldown gate observe a drifting clock within a single resolve (and was + // needless syscall churn). Capture now ONCE here so the RLock fast path and + // the write-lock advance/degrade slow path evaluate every member against + // one coherent instant. Semantics are unchanged: a member is cooling iff + // cooldownUntil.After(now). + now := time.Now() + cooling := func(m string) bool { - // Caller holds ph.mu (R or W). + // Caller holds ph.mu (R or W). Compared against the resolve-wide `now` + // snapshot (Finding 1), not a fresh time.Now() per call. h, tracked := pr.health.health[m] - return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(time.Now()) + return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(now) } // Read-mostly fast path (Finding 2): the common case is a sticky hold — @@ -435,7 +445,14 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { // the cooldown view a concurrent failover's MarkCooldown writes (same mu, // no second lock, no lock-ordering hazard). pr.health.mu.Lock() - defer pr.health.mu.Unlock() + // Finding 3: the write lock is held ONLY for the sticky-pointer + // re-check, the advance mutation, and the degrade-target selection. + // Each slow-path exit Unlocks EXPLICITLY before its return/log.Printf + // (no `defer`), so the lock never spans logging or the value return and + // concurrent ResolveActive / MarkCooldown are not serialized behind log + // I/O. Every path below unlocks exactly once before returning; there is + // no path that returns while still holding the lock and none unlocks + // twice. // Position of the current sticky member in THIS generation's member // list. startIdx == -1 (no valid current) makes the scan start at @@ -454,6 +471,7 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { // advanced here, or it was a benign RLock->Lock race. Keep // serving it; do not move. if !cooling(ent.member) { + pr.health.mu.Unlock() return ent.member, true } startIdx = i @@ -470,6 +488,7 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { m := members[idx] if !cooling(m) { pr.health.active[name] = activeEntry{member: m, epoch: genEpoch} + pr.health.mu.Unlock() return m, true } } @@ -498,13 +517,22 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) { } } } + // Finding 3: the degrade TARGET (and the values the WARNING needs) are + // fully computed above under the write lock; the sticky pointer is + // deliberately NOT moved in the degrade case. Release the lock here, + // BEFORE the log.Printf and return, so logging never serializes + // concurrent ResolveActive / MarkCooldown. memberCount/name/soonest* + // are locals captured under the lock; the post-Unlock log/return only + // reads them, never the shared map. + memberCount := len(members) + pr.health.mu.Unlock() if soonestParked != "" { log.Printf("[POOL] all %d members of pool %q are cooling; degrading to operator-parked-but-healthy %q", - len(members), name, soonestParked) + memberCount, name, soonestParked) return soonestParked, true } log.Printf("[POOL] all %d members of pool %q are in cooldown; degrading to %q (recovers %s)", - len(members), name, soonest, soonestUntil.Format(time.RFC3339)) + memberCount, name, soonest, soonestUntil.Format(time.RFC3339)) return soonest, true } From 57e72e68f4404644cac9007f086bacf9e99da11e Mon Sep 17 00:00:00 2001 From: Nikita Nemirovsky Date: Mon, 18 May 2026 21:21:58 +0800 Subject: [PATCH 8/8] test(vault): use realistic epoch>=1 in sticky-pointer tests and add concurrent degrade no-double-unlock test --- internal/vault/pool_test.go | 69 ++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go index 958bc59..1318223 100644 --- a/internal/vault/pool_test.go +++ b/internal/vault/pool_test.go @@ -189,7 +189,13 @@ func ManualRotateCooldownForTest() time.Duration { return 300 * time.Second } // it back to position 0. func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) { shared := NewPoolHealth() - gen1 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared) + // Realistic epoch >= 1 (Finding 2): production membership rows are stamped + // with a monotonic epoch >= 1; epoch 0 is documented as "never live", so a + // zero-epoch "live" generation is unrealistic and could mask an + // epoch-guard regression. All generations here rebuild the SAME pool + // (membership unchanged) so they share epoch 1 — the sticky pointer must + // survive the swap precisely because the epoch still matches. + gen1 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b")}, nil, shared) if got, _ := gen1.ResolveActive("pool"); got != "a" { t.Fatalf("gen1 initial = %q, want a", got) } @@ -201,8 +207,10 @@ func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) { // "a" recovers (durable write may not have landed). gen1.MarkCooldown("a", time.Time{}, "") - // Reload: fresh generation, SAME shared health, store has no rows. - gen2 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared) + // Reload: fresh generation, SAME shared health, store has no rows. Same + // pool/membership -> same epoch 1, so the sticky entry's epoch still + // matches and the pointer must survive (not snap back). + gen2 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b")}, nil, shared) gen2.MergeLiveCooldowns(gen1) // Sticky pointer survived the swap: gen2 keeps serving "b", not "a". if got, _ := gen2.ResolveActive("pool"); got != "b" { @@ -220,15 +228,66 @@ func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) { // A pool dropped entirely prunes its sticky pointer (mirrors cooldown // prune) so a re-add does not inherit a stale active member. - gen3 := NewPoolResolverShared([]store.Pool{mkPool("other", "x")}, nil, shared) + gen3 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("other", 1, "x")}, nil, shared) gen3.MergeLiveCooldowns(gen2) - gen4 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b"), mkPool("other", "x")}, nil, shared) + gen4 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b"), mkPoolEpoch("other", 1, "x")}, nil, shared) gen4.MergeLiveCooldowns(gen3) if got, _ := gen4.ResolveActive("pool"); got != "a" { t.Fatalf("re-added pool active = %q, want a (dropped pool's sticky pointer must be pruned)", got) } } +// TestResolveActiveConcurrentDegradeNoDoubleUnlock pins Finding 3: the slow +// path now Unlocks the write lock EXPLICITLY (no defer) before every +// return/log.Printf, including the all-cooling degrade exit. A +// double-unlock or unlock-of-rlock would panic/race; a missed unlock would +// deadlock. Drive many concurrent ResolveActive calls with EVERY member +// cooling (forces the degrade exit every time) and assert the result is +// stable and the run is race-clean (run under -race). Behavior is unchanged: +// the soonest-recovering member is still returned and the sticky pointer is +// not moved. +func TestResolveActiveConcurrentDegradeNoDoubleUnlock(t *testing.T) { + now := time.Now() + health := []store.CredentialHealth{ + {Credential: "a", Status: "cooldown", CooldownUntil: now.Add(300 * time.Second), LastFailureReason: "401"}, + {Credential: "b", Status: "cooldown", CooldownUntil: now.Add(30 * time.Second), LastFailureReason: "429"}, + } + pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, health) + + var wg sync.WaitGroup + for i := 0; i < 64; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for j := 0; j < 300; j++ { + got, ok := pr.ResolveActive("pool") + if !ok || got != "b" { + t.Errorf("degrade resolve = %q,%v; want b,true (soonest)", got, ok) + return + } + } + }() + } + wg.Wait() + // Concurrent MarkCooldown vs degrade ResolveActive: same mu, must be + // race-clean and never deadlock after the explicit-unlock restructure. + var wg2 sync.WaitGroup + wg2.Add(2) + go func() { + defer wg2.Done() + for j := 0; j < 500; j++ { + pr.ResolveActive("pool") + } + }() + go func() { + defer wg2.Done() + for j := 0; j < 500; j++ { + pr.MarkCooldown("a", time.Now().Add(300*time.Second), "401") + } + }() + wg2.Wait() +} + func TestResolveActivePassthroughForNonPool(t *testing.T) { pr := NewPoolResolver(nil, nil) got, ok := pr.ResolveActive("plain_cred")