From 40d069e464649c2c8936aa35fcf39d31cbd0c51f Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 20:44:57 +0800
Subject: [PATCH 1/8] docs(plans): sticky pool failover plan

---
 docs/plans/20260518-sticky-failover.md | 103 +++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 docs/plans/20260518-sticky-failover.md

diff --git a/docs/plans/20260518-sticky-failover.md b/docs/plans/20260518-sticky-failover.md
new file mode 100644
index 0000000..50e119f
--- /dev/null
+++ b/docs/plans/20260518-sticky-failover.md
@@ -0,0 +1,103 @@
+# Sticky Pool Failover (no "main" account)
+
+## Problem
+
+Live on knuth: `pool openai_pool failed over openai_oauth -> openai_oauth_2 (429)` repeats
+every few minutes (20+ messages). Hermes keeps working. Root cause is a flap:
+
+- `openai_oauth` is position 0 (the de-facto "main"). Its OpenAI quota is exhausted, so it
+  429s on every request.
+- On 429 sluice fails over to `openai_oauth_2` and parks `openai_oauth` for only
+  `vault.RateLimitCooldown` = 60s.
+- `PoolResolver.ResolveActive` picks the **first member in position order** that is healthy
+  or whose cooldown expired. 60s later `openai_oauth`'s cooldown lapses, so it is re-selected
+  even though `openai_oauth_2` was serving fine.
+- `openai_oauth` is still quota-exhausted upstream -> 429 -> failover again -> another
+  identical Telegram notice. Repeats on every request gap > 60s.
+
+The real OpenAI Codex quota window is hours/weekly, so re-probing the exhausted account
+every 60s and snapping back to it is wrong.
+
+## Desired behavior (user decision)
+
+Sticky failover: there is no "main" account. Stay on whichever member is currently active
+until **that** member exhausts, then advance to the next member. A lower-position member
+recovering from cooldown must NOT cause a switch back to it. A new failover (and its single
+Telegram notice) happens only on a genuine exhaustion transition, not on cooldown lapse.
+
+Mode toggle (position-priority vs sticky as selectable strategies) is explicitly out of
+scope for this change; sticky becomes the behavior. Note it as a possible follow-up.
+
+## Design
+
+`ResolveActive` is the single source of truth: `internal/proxy/pool_failover.go` calls
+`pr.ResolveActive(pool)` to compute the new active ("to") after cooling the old member, so
+making selection sticky fixes the flap, the failover events, and the notification spam in
+one place.
+
+1. Add a per-pool current-active pointer to the swap-surviving shared `PoolHealth`
+   (`internal/vault/pool.go`), guarded by the same mutex as the cooldown map, so it
+   survives `NewPoolResolverShared` regeneration and atomic swaps (same lifecycle as
+   cooldowns; carry it the way cooldowns are carried).
+2. Sticky `ResolveActive(pool)`:
+   - Let `cur` = shared current-active for the pool, if set and still a member of this
+     generation.
+   - If `cur` is set and healthy (no active cooldown) -> return `cur` (sticky hold).
+   - Else pick the next eligible member by position **starting after `cur`'s position and
+     wrapping** (so we advance forward, never snap back to position 0); skip members in
+     active cooldown; treat `ManualRotateReason` parks with the existing
+     better-degrade-target semantics. Set shared current-active to the picked member and
+     return it.
+   - If every member is cooling: keep the existing degrade behavior (operator-parked-but-
+     healthy first, else soonest-recovering) and do NOT move the sticky pointer, so when a
+     member recovers the next call advances to a healthy one rather than the absolute
+     position-0 member.
+3. Operator `sluice pool rotate` semantics unchanged at the surface: it still parks the
+   current active so the next `ResolveActive` advances; with sticky that advance lands on
+   the next member and stays there (no snap-back), which is the intended rotate behavior.
+4. Keep all existing concurrency invariants: sticky pointer reads/writes under the same
+   `PoolHealth.mu`; never lost across atomic pointer swap; a stale resolver generation must
+   not clobber it (mirror the cooldown CRITICAL-1 handling).
+
+## Out of scope
+
+- Strategy/mode toggle across CLI/REST/Telegram + schema (`credential_pools.strategy`) —
+  follow-up only; note the synergy, do not build.
+- Honoring upstream `Retry-After` / changing `RateLimitCooldown` constant — sticky makes
+  the short cooldown harmless (we no longer re-probe the cooled member until forced), so a
+  cooldown-duration change is unnecessary for this fix.
+
+## Testing strategy
+
+- Unit (vault): sticky hold — active member stays selected across many `ResolveActive`
+  calls while a lower-position member is healthy.
+- Unit (vault): flap regression — fail member A (cooldown), `ResolveActive` returns B; A's
+  cooldown expires; `ResolveActive` still returns B (no snap-back). B then cools ->
+  advance to next, wrapping.
+- Unit (vault): all-cooling degrade unchanged; operator-parked degrade-target preserved.
+- Unit (vault): sticky pointer survives `NewPoolResolverShared` regeneration + swap, and a
+  stale generation cannot clobber it (extend the existing CRITICAL-1 style test).
+- Unit (proxy): the failover path emits exactly ONE `cred_failover` + one notice per real
+  exhaustion transition, and emits NOTHING when a non-active member's cooldown merely
+  lapses (the spam regression, fail-before/pass-after).
+- Full `go test ./...`, `-race` on `internal/vault` and `internal/proxy`, gofumpt,
+  golangci-lint, `go vet ./...` and `-tags=e2e ./e2e/`.
+
+## Steps
+
+### Task 1: Sticky selection in vault.PoolResolver
+- [ ] Add swap-surviving per-pool current-active to shared `PoolHealth` (same mutex)
+- [ ] Rewrite `ResolveActive` to the sticky algorithm above; preserve degrade + parked semantics
+- [ ] Preserve CRITICAL-1 invariants (no loss/clobber across swap; stale generation safe)
+- [ ] Unit tests: sticky hold, flap regression (no snap-back), advance+wrap, degrade unchanged, swap-survival
+- [ ] `go test ./internal/vault/ -race`, gofumpt, vet
+
+### Task 2: Failover path + notification spam regression
+- [ ] Confirm `pool_failover.go` from->to now changes only on real exhaustion (sticky source of truth); adjust only if it bypasses `ResolveActive`
+- [ ] Test: one `cred_failover`+notice per real transition; zero events when a non-active member's cooldown lapses (fail-before/pass-after)
+- [ ] `go test ./internal/proxy/ -race`, gofumpt, vet
+
+### Task 3: Docs + final validation
+- [ ] Update CLAUDE.md credential-pools section to describe sticky selection (replace the position-priority wording) and note the mode-toggle follow-up
+- [ ] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues; full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`
+- [ ] Independently verify committed HEAD builds and tests pass (do not trust subagent green)

From 8f4399485cf27172217a0d8fe05a796c21eb8160 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 20:55:04 +0800
Subject: [PATCH 2/8] fix(vault): sticky pool failover selection (no main
 account)

---
 internal/vault/pool.go | 128 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 113 insertions(+), 15 deletions(-)

diff --git a/internal/vault/pool.go b/internal/vault/pool.go
index 2ecb084..7a3f5e9 100644
--- a/internal/vault/pool.go
+++ b/internal/vault/pool.go
@@ -96,13 +96,30 @@ type PoolHealth struct {
 	// same-named successor) so the stale write must NOT apply. Mutated only
 	// under mu, the same lock the cooldown map uses.
 	currentMembers map[string]memberIdentity
+	// active maps a pool name -> the credential ResolveActive last settled
+	// on (the sticky pointer). It lives here, NOT on PoolResolver, for the
+	// exact same CRITICAL-1 reason cooldowns do: a membership change / SIGHUP
+	// / 2s data_version watcher rebuilds a fresh PoolResolver that the server
+	// atomically pointer-swaps, and ResolveActive on the new generation must
+	// keep serving the member it switched to instead of snapping back to
+	// position 0 (the flap that spams cred_failover + Telegram). Reads and
+	// writes are under the SAME mu as the cooldown map (no second lock, no
+	// lock-ordering hazard). It survives swaps because every generation
+	// shares one *PoolHealth; a stale generation cannot clobber it because
+	// ResolveActive only ever writes a member of THIS generation's member
+	// list, and the MergeLiveCooldowns / SetCurrentMembers prune drops the
+	// pointer for any pool no longer present (mirrors the cooldown prune).
+	active map[string]string
 }
 
 // NewPoolHealth returns an empty shared health map. Call this exactly once
 // per process and thread the result through every NewPoolResolver so all
 // resolver generations share one cooldown view.
 func NewPoolHealth() *PoolHealth {
-	return &PoolHealth{health: make(map[string]memberHealth)}
+	return &PoolHealth{
+		health: make(map[string]memberHealth),
+		active: make(map[string]string),
+	}
 }
 
 // SetCurrentMembers atomically replaces the authoritative member set for the
@@ -287,11 +304,32 @@ func (pr *PoolResolver) Members(pool string) []string {
 
 // ResolveActive expands a name to the credential that should actually be
 // used. For a plain credential (not a pool) the name is returned unchanged.
-// For a pool, the first member that is healthy or whose cooldown has expired
-// (in position order) is returned. If every member is still cooling down,
-// the member with the soonest recovery is returned and a WARNING is logged
-// (degraded: sluice keeps serving with the least-bad account rather than
-// failing the request outright).
+//
+// For a pool the selection is STICKY (there is no "main" / position-0
+// account):
+//
+//   - If a current-active member is recorded for this pool, is still a
+//     member of THIS generation, and is healthy (no active cooldown), it is
+//     returned unchanged — a sticky hold. A lower-position member recovering
+//     from cooldown does NOT cause a switch back to it (this is the flap fix:
+//     re-probing an exhausted-upstream member every 60s and snapping back to
+//     it was wrong and spammed cred_failover + Telegram).
+//   - Otherwise the next eligible member is chosen by position order
+//     STARTING AFTER the current-active member's position and WRAPPING
+//     (advance forward, never snap back to position 0). Members in active
+//     cooldown are skipped. The chosen member becomes the new sticky
+//     current-active for the pool.
+//   - If EVERY member is cooling, sluice keeps serving with the least-bad
+//     account (degraded): an operator-parked-but-healthy member
+//     (ManualRotateReason) is preferred over a genuinely failed one, else
+//     the soonest-recovering member; a WARNING is logged. The sticky pointer
+//     is NOT moved in this case, so when a member recovers the next call
+//     advances forward to it rather than snapping to absolute position 0.
+//
+// The current-active pointer lives on the shared *PoolHealth and is read/
+// written under the same mu as the cooldown map, so it survives resolver
+// pointer swaps (CRITICAL-1) and a stale generation cannot clobber it: a
+// write only ever stores a member of THIS generation's member list.
 func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 	if pr == nil {
 		return name, true
@@ -306,20 +344,58 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 		return "", false
 	}
 
-	// Read the shared health map under its RLock. A concurrent failover's
-	// MarkCooldown takes the same map's write lock, so this observes a
-	// consistent cooldown view regardless of resolver generation.
-	pr.health.mu.RLock()
-	defer pr.health.mu.RUnlock()
+	// Take the shared map's WRITE lock: the sticky pointer is mutated here
+	// when the active member is unhealthy/unset, and it must be consistent
+	// with the cooldown view a concurrent failover's MarkCooldown writes
+	// (same mu, no second lock, no lock-ordering hazard).
+	pr.health.mu.Lock()
+	defer pr.health.mu.Unlock()
 
 	now := time.Now()
-	var soonest, soonestParked string
-	var soonestUntil, soonestParkedUntil time.Time
-	for _, m := range members {
+
+	cooling := func(m string) bool {
 		h, tracked := pr.health.health[m]
-		if !tracked || h.cooldownUntil.IsZero() || !h.cooldownUntil.After(now) {
+		return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(now)
+	}
+
+	// Position of the current sticky member in THIS generation's member
+	// list. startIdx == -1 (no valid current) makes the scan start at
+	// position 0; otherwise it starts AFTER cur and wraps.
+	startIdx := -1
+	if cur, set := pr.health.active[name]; set {
+		for i, m := range members {
+			if m != cur {
+				continue
+			}
+			// Sticky hold: the recorded active member is still a member of
+			// this generation and is healthy — keep serving it. Do not move.
+			if !cooling(cur) {
+				return cur, true
+			}
+			startIdx = i
+			break
+		}
+	}
+
+	// Advance forward from AFTER the current member (or position 0 when
+	// there is no valid current), wrapping, picking the first non-cooling
+	// member. Never snap back to position 0 on a flap.
+	n := len(members)
+	for off := 1; off <= n; off++ {
+		idx := (startIdx + off) % n
+		m := members[idx]
+		if !cooling(m) {
+			pr.health.active[name] = m
 			return m, true
 		}
+	}
+
+	// Every member is cooling: degrade (do NOT move the sticky pointer, so a
+	// recovery advances forward rather than snapping to position 0).
+	var soonest, soonestParked string
+	var soonestUntil, soonestParkedUntil time.Time
+	for _, m := range members {
+		h := pr.health.health[m]
 		if soonest == "" || h.cooldownUntil.Before(soonestUntil) {
 			soonest = m
 			soonestUntil = h.cooldownUntil
@@ -515,6 +591,28 @@ func (pr *PoolResolver) MergeLiveCooldowns(prev *PoolResolver) {
 			cm[cred] = id
 		}
 		pr.health.currentMembers = cm
+		// Sticky-pointer prune (mirrors the cooldown prune above): drop the
+		// recorded active member for any pool this generation no longer has,
+		// or whose recorded member is no longer in that pool. A surviving
+		// pool keeps its sticky member so a benign reload does NOT snap it
+		// back to position 0 (the whole point of CRITICAL-1 for the pointer).
+		for poolName, cur := range pr.health.active {
+			poolMembers, stillPool := pr.pools[poolName]
+			if !stillPool {
+				delete(pr.health.active, poolName)
+				continue
+			}
+			stillMember := false
+			for _, m := range poolMembers {
+				if m == cur {
+					stillMember = true
+					break
+				}
+			}
+			if !stillMember {
+				delete(pr.health.active, poolName)
+			}
+		}
 		pr.health.mu.Unlock()
 		return
 	}

From e1b7ad1c46cc6899accbb590f240be4843e5de11 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 20:55:11 +0800
Subject: [PATCH 3/8] test(pool): sticky-hold, flap, advance-wrap,
 swap-survival and spam regressions

---
 internal/proxy/pool_failover_test.go | 107 ++++++++++++++++++++-
 internal/vault/pool_test.go          | 139 ++++++++++++++++++++++++++-
 2 files changed, 240 insertions(+), 6 deletions(-)

diff --git a/internal/proxy/pool_failover_test.go b/internal/proxy/pool_failover_test.go
index b039a61..652c984 100644
--- a/internal/proxy/pool_failover_test.go
+++ b/internal/proxy/pool_failover_test.go
@@ -193,11 +193,23 @@ func TestFailoverCooldownTTLAndLazyRecovery(t *testing.T) {
 		t.Fatalf("auth-fail cooldown TTL = %v, want ~%v", gotTTL, vault.AuthFailCooldown)
 	}
 
-	// Lazy recovery: force the cooldown to the past; ResolveActive must
-	// treat memA as eligible again with no background scheduler involved.
+	// Lazy recovery: force the cooldown to the past. memA becomes ELIGIBLE
+	// again with no background scheduler involved (CooldownUntil reports it
+	// is no longer cooling). Selection is STICKY, so memB - which we failed
+	// over to and which is healthy - remains active: a recovered lower-
+	// position member must NOT snap back (the flap fix). memA's eligibility
+	// only matters as a future advance target if memB later exhausts.
 	pr.MarkCooldown("memA", time.Now().Add(-time.Second), "expired")
+	if _, cooling := pr.CooldownUntil("memA"); cooling {
+		t.Fatal("after expiry memA must no longer be cooling (lazy recovery, no scheduler)")
+	}
+	if active, _ := pr.ResolveActive("codex_pool"); active != "memB" {
+		t.Fatalf("after expiry active = %q, want memB (sticky: recovered memA must NOT snap back)", active)
+	}
+	// Confirm memA IS the advance target once memB exhausts (wrap forward).
+	pr.MarkCooldown("memB", time.Now().Add(vault.RateLimitCooldown), "429")
 	if active, _ := pr.ResolveActive("codex_pool"); active != "memA" {
-		t.Fatalf("after expiry active = %q, want memA (lazy recovery)", active)
+		t.Fatalf("after memB exhausts active = %q, want memA (advance forward to recovered member)", active)
 	}
 }
 
@@ -346,6 +358,95 @@ func TestFailoverAuditEvent(t *testing.T) {
 	}
 }
 
+// TestStickyFailoverNoSpamOnNonActiveCooldownLapse is the notification-spam
+// regression for the live knuth flap. memA (position 0) is upstream-
+// exhausted. Sequence:
+//
+//  1. memA 429 -> exactly ONE cred_failover (memA->memB) + one notice.
+//  2. memA's short cooldown lapses. Under the OLD position-priority
+//     ResolveActive, the next request's injection would re-select memA
+//     (lower position, no longer cooling), hit the still-exhausted upstream,
+//     429 again, and emit ANOTHER identical failover + Telegram notice -
+//     repeating every cooldown window (the 20+ message spam).
+//  3. With sticky selection memB stays active after memA recovers, so the
+//     subsequent request resolves to memB, succeeds (200), and produces NO
+//     additional failover event or notice.
+//
+// Fail-before: step 3 would record a 2nd cred_failover (the flap).
+// Pass-after: exactly one cred_failover total, one notice total.
+func TestStickyFailoverNoSpamOnNonActiveCooldownLapse(t *testing.T) {
+	dir := t.TempDir()
+	logPath := filepath.Join(dir, "audit.log")
+	logger, err := audit.NewFileLogger(logPath)
+	if err != nil {
+		t.Fatalf("NewFileLogger: %v", err)
+	}
+	t.Cleanup(func() { _ = logger.Close() })
+
+	addon, _, prPtr := setupPoolAddon(t, "memA", "memB")
+	addon.auditLog = logger
+	client := setupAddonConn(addon, "auth.example.com:443")
+	pr := prPtr.Load()
+
+	var notices int
+	addon.SetOnFailover(func(FailoverEvent) { notices++ })
+
+	if got, _ := pr.ResolveActive("codex_pool"); got != "memA" {
+		t.Fatalf("pre = %q, want memA", got)
+	}
+
+	// 1. memA exhausts -> one real failover to memB.
+	f1 := newPoolRespFlow(client, 429, []byte(`{"error":"rate_limited"}`))
+	addon.flowInjected.Tag(f1.Id, "memA")
+	addon.Response(f1)
+	if got, _ := pr.ResolveActive("codex_pool"); got != "memB" {
+		t.Fatalf("after 429 active = %q, want memB", got)
+	}
+
+	// 2. memA's short cooldown lapses (still upstream-exhausted in reality).
+	pr.MarkCooldown("memA", time.Now().Add(-time.Second), "expired")
+
+	// The next request's injection resolves the pool. STICKY: it must stay
+	// on memB, NOT snap back to the recovered-but-still-exhausted memA. This
+	// is the single source of truth that kills the flap.
+	if got, _ := pr.ResolveActive("codex_pool"); got != "memB" {
+		t.Fatalf("FLAP: after memA cooldown lapse, injection resolves %q, want memB", got)
+	}
+
+	// 3. The subsequent legitimate request therefore hits memB and succeeds;
+	// a 200 is a documented no-op (no failover, no notice).
+	f2 := newPoolRespFlow(client, 200, []byte(`{"ok":true}`))
+	addon.flowInjected.Tag(f2.Id, "memB")
+	addon.Response(f2)
+
+	if err := logger.Close(); err != nil {
+		t.Fatalf("logger close: %v", err)
+	}
+	data, err := os.ReadFile(logPath)
+	if err != nil {
+		t.Fatalf("read audit log: %v", err)
+	}
+	failovers := 0
+	for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
+		if line == "" {
+			continue
+		}
+		var evt audit.Event
+		if uerr := json.Unmarshal([]byte(line), &evt); uerr != nil {
+			t.Fatalf("unmarshal %q: %v", line, uerr)
+		}
+		if evt.Action == "cred_failover" || evt.Action == "pool_exhausted" {
+			failovers++
+		}
+	}
+	if failovers != 1 {
+		t.Fatalf("got %d failover audit events, want exactly 1 (sticky must not flap on non-active cooldown lapse)\n%s", failovers, data)
+	}
+	if notices != 1 {
+		t.Fatalf("got %d failover notices, want exactly 1 (no Telegram spam on cooldown lapse)", notices)
+	}
+}
+
 // TestPoolForResponseResolvesActiveMember sanity-checks the destination ->
 // pool reverse mapping used by handlePoolFailover.
 func TestPoolForResponseResolvesActiveMember(t *testing.T) {
diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go
index fd61a2f..99acfa3 100644
--- a/internal/vault/pool_test.go
+++ b/internal/vault/pool_test.go
@@ -98,6 +98,137 @@ func TestMarkCooldownLegacyUnscopedStillGated(t *testing.T) {
 	}
 }
 
+// TestResolveActiveStickyHold: once a member is selected it keeps being
+// returned across many ResolveActive calls while it is healthy, even though a
+// lower-position member is also healthy. (Fail-before: old position-priority
+// always returned position-0 "a".)
+func TestResolveActiveStickyHold(t *testing.T) {
+	pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil)
+	// First resolution settles on "a" (position 0, both healthy).
+	if got, _ := pr.ResolveActive("pool"); got != "a" {
+		t.Fatalf("initial = %q, want a", got)
+	}
+	// "a" cools -> switch to "b".
+	pr.MarkCooldown("a", time.Now().Add(60*time.Second), "429")
+	if got, _ := pr.ResolveActive("pool"); got != "b" {
+		t.Fatalf("after cooling a = %q, want b", got)
+	}
+	// "a" recovers, but "b" is healthy: sticky hold across many calls.
+	pr.MarkCooldown("a", time.Time{}, "")
+	for i := 0; i < 25; i++ {
+		if got, _ := pr.ResolveActive("pool"); got != "b" {
+			t.Fatalf("call %d: sticky hold broke, got %q want b (lower-position a recovered must NOT snap back)", i, got)
+		}
+	}
+}
+
+// TestResolveActiveFlapRegression is the core flap fix. Sequence mirrors the
+// live knuth bug: A (position 0) is upstream-exhausted. Fail A (cooldown) ->
+// ResolveActive returns B. A's cooldown EXPIRES -> ResolveActive STILL returns
+// B (no snap-back, so A is not re-probed every 60s and no spurious failover).
+// Then B itself cools -> advance to the next member WITH WRAP (back to A,
+// which has recovered). Fail-before (position-priority): step 3 would return
+// A, the flap.
+func TestResolveActiveFlapRegression(t *testing.T) {
+	pr := NewPoolResolver([]store.Pool{mkPool("pool", "A", "B", "C")}, nil)
+	if got, _ := pr.ResolveActive("pool"); got != "A" {
+		t.Fatalf("initial = %q, want A", got)
+	}
+	// 1. A exhausts -> failover to B.
+	pr.MarkCooldown("A", time.Now().Add(60*time.Second), "429")
+	if got, _ := pr.ResolveActive("pool"); got != "B" {
+		t.Fatalf("after cooling A = %q, want B", got)
+	}
+	// 2. A's short cooldown lapses (still upstream-exhausted in reality).
+	pr.MarkCooldown("A", time.Time{}, "")
+	if got, _ := pr.ResolveActive("pool"); got != "B" {
+		t.Fatalf("FLAP: after A cooldown lapse = %q, want B (must NOT snap back to A)", got)
+	}
+	// 3. B itself now exhausts -> advance forward with wrap. C is next.
+	pr.MarkCooldown("B", time.Now().Add(60*time.Second), "429")
+	if got, _ := pr.ResolveActive("pool"); got != "C" {
+		t.Fatalf("after cooling B = %q, want C (advance forward from B)", got)
+	}
+	// 4. C exhausts too -> wrap forward past end -> A (recovered at step 2).
+	pr.MarkCooldown("C", time.Now().Add(60*time.Second), "429")
+	if got, _ := pr.ResolveActive("pool"); got != "A" {
+		t.Fatalf("after cooling C = %q, want A (wrap forward, A is healthy again)", got)
+	}
+}
+
+// TestResolveActiveStickyRotateAdvancesAndStays: `sluice pool rotate` parks
+// the active member with ManualRotateReason; the next ResolveActive must
+// advance to the next member and STAY there (no snap-back) even after the
+// parked member's park lapses.
+func TestResolveActiveStickyRotateAdvancesAndStays(t *testing.T) {
+	pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, nil)
+	if got, _ := pr.ResolveActive("pool"); got != "a" {
+		t.Fatalf("initial = %q, want a", got)
+	}
+	// Operator rotate: park the active "a".
+	pr.MarkCooldown("a", time.Now().Add(ManualRotateCooldownForTest()), ManualRotateReason)
+	if got, _ := pr.ResolveActive("pool"); got != "b" {
+		t.Fatalf("after rotate = %q, want b (advance)", got)
+	}
+	// "a"'s park lapses: must NOT snap back, "b" stays active.
+	pr.MarkCooldown("a", time.Time{}, "")
+	for i := 0; i < 10; i++ {
+		if got, _ := pr.ResolveActive("pool"); got != "b" {
+			t.Fatalf("call %d after park lapse = %q, want b (rotate advances AND stays)", i, got)
+		}
+	}
+}
+
+// ManualRotateCooldownForTest is a small helper duration for the rotate test.
+func ManualRotateCooldownForTest() time.Duration { return 300 * time.Second }
+
+// TestResolveActiveStickyPointerSurvivesRebuildAndSwap extends the CRITICAL-1
+// shared-health regression to the sticky pointer: a member switched-to on an
+// OLD generation must remain the active member on a NEW generation built
+// against the SAME shared PoolHealth, and a stale generation must not clobber
+// it back to position 0.
+func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) {
+	shared := NewPoolHealth()
+	gen1 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared)
+	if got, _ := gen1.ResolveActive("pool"); got != "a" {
+		t.Fatalf("gen1 initial = %q, want a", got)
+	}
+	// Failover on gen1 switches the sticky pointer to "b".
+	gen1.MarkCooldown("a", time.Now().Add(120*time.Second), "429")
+	if got, _ := gen1.ResolveActive("pool"); got != "b" {
+		t.Fatalf("gen1 after cooling a = %q, want b", got)
+	}
+	// "a" recovers (durable write may not have landed).
+	gen1.MarkCooldown("a", time.Time{}, "")
+
+	// Reload: fresh generation, SAME shared health, store has no rows.
+	gen2 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared)
+	gen2.MergeLiveCooldowns(gen1)
+	// Sticky pointer survived the swap: gen2 keeps serving "b", not "a".
+	if got, _ := gen2.ResolveActive("pool"); got != "b" {
+		t.Fatalf("gen2 active = %q, want b (sticky pointer must survive resolver swap, no snap-back)", got)
+	}
+
+	// A stale OLD generation's ResolveActive must not clobber the pointer to
+	// a member of the wrong/old member list. gen1 still has {a,b}; calling
+	// it again only ever writes a member of THIS gen's list. Even so, the
+	// authoritative current generation (gen2) must keep "b".
+	gen1.ResolveActive("pool")
+	if got, _ := gen2.ResolveActive("pool"); got != "b" {
+		t.Fatalf("after stale gen1 ResolveActive, gen2 = %q, want b (stale generation must not clobber sticky pointer)", got)
+	}
+
+	// A pool dropped entirely prunes its sticky pointer (mirrors cooldown
+	// prune) so a re-add does not inherit a stale active member.
+	gen3 := NewPoolResolverShared([]store.Pool{mkPool("other", "x")}, nil, shared)
+	gen3.MergeLiveCooldowns(gen2)
+	gen4 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b"), mkPool("other", "x")}, nil, shared)
+	gen4.MergeLiveCooldowns(gen3)
+	if got, _ := gen4.ResolveActive("pool"); got != "a" {
+		t.Fatalf("re-added pool active = %q, want a (dropped pool's sticky pointer must be pruned)", got)
+	}
+}
+
 func TestResolveActivePassthroughForNonPool(t *testing.T) {
 	pr := NewPoolResolver(nil, nil)
 	got, ok := pr.ResolveActive("plain_cred")
@@ -219,10 +350,12 @@ func TestMarkCooldownSynchronousFlip(t *testing.T) {
 	if _, cooling := pr.CooldownUntil("a"); !cooling {
 		t.Error("CooldownUntil(a) cooling=false, want true")
 	}
-	// Clearing (zero/past) recovers the member.
+	// Clearing (zero/past) recovers the member, but selection is STICKY:
+	// "a" recovering does NOT snap the active member back to it. "b" was
+	// switched to and is healthy, so it keeps being served (flap fix).
 	pr.MarkCooldown("a", time.Time{}, "")
-	if got, _ := pr.ResolveActive("pool"); got != "a" {
-		t.Errorf("after clear active = %q, want a", got)
+	if got, _ := pr.ResolveActive("pool"); got != "b" {
+		t.Errorf("after clear active = %q, want b (sticky: recovered a must NOT snap back)", got)
 	}
 }
 

From 22243f400a8676c74367763d1416f1009181dd13 Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 20:55:18 +0800
Subject: [PATCH 4/8] docs(pools): describe sticky active-member selection

---
 CLAUDE.md                              |  2 +-
 docs/plans/20260518-sticky-failover.md | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 181c737..eec0716 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -191,7 +191,7 @@ Auto-failover on 429/401 is primary; `pool rotate` is an override.
 
 - **Single chokepoint (I2):** every `binding.Credential` / `OAuthIndex.Has` / `extractInjectableSecret` / persist consumer on the HTTP/HTTPS OAuth path routes through `PoolResolver.ResolveActive` (`resolveInjectionTarget` for pass-1 header + pass-2 swap; `resolveOAuthResponseAttribution` for response/persist). `idx.Has` is always called with the resolved member, never the pool. Plain credentials pass through unchanged; SSH/mail are non-OAuth, out of scope.
 - **QUIC scope:** the HTTP/1.x/HTTP/2 MITM addon implements the full feature set (R1, R3, Phase 2). The HTTP/3/QUIC path (`QUICProxy.buildPhantomPairs`, binding-header injection in `quic.go`) is a request-side buffered swap with **no response-side OAuth interception**, but IS pool-aware on the request side: `QUICProxy.resolvePoolTarget` (via `NewQUICProxy`'s `poolResolver`) selects the active member's real secret and routes through `buildPooledOAuthPhantomPairs` so the access phantom is the same pool-stable JWT (R3 holds over QUIC). QUIC does **not** do R1 attribution or Phase 2 failover — the injected member is whatever the HTTP path / `pool rotate` last made active, and a QUIC-only 429/401 or refresh is not acted on. Deployments needing R1/auto-failover must route the pooled upstream over HTTP/HTTPS.
-- **Active-member selection:** healthy or expired-cooldown members first, by configured position; if all are in cooldown, the soonest-recovering is returned with a WARNING (degrade, never hard-fail). Recovery is lazy (evaluated in `ResolveActive`, no scheduler).
+- **Active-member selection (sticky, no "main" account):** `ResolveActive` is sticky. There is no position-0 "main": once selection settles on a member sluice keeps returning it while it is healthy, even if a lower-position member recovers from cooldown. A new active member is chosen only when the current one cools. The chosen member is the next eligible one by position **starting after the current member and wrapping** (advance forward, never snap back to position 0). The sticky current-active pointer lives on the shared swap-surviving `PoolHealth` under the same mutex as the cooldown map, so it survives `NewPoolResolverShared` regeneration and atomic pointer swaps (CRITICAL-1) and a stale resolver generation cannot clobber it, exactly like cooldowns. This kills the live flap where a 60s `RateLimitCooldown` lapse re-selected an upstream-exhausted account and respammed `cred_failover` plus a Telegram notice every cooldown window. If every member is cooling, the existing degrade applies (operator-parked-but-healthy `ManualRotateReason` first, else soonest-recovering, with a WARNING, never hard-fail) and the sticky pointer is left untouched so a recovery advances forward rather than snapping to position 0. `sluice pool rotate` still works: it parks the active member, so the next `ResolveActive` advances to the next member and then stays there (no snap-back). Recovery is lazy (evaluated in `ResolveActive`, no scheduler). A selectable position-priority-vs-sticky strategy mode is a possible follow-up and is out of scope here.
 - **R1 refresh-token attribution / fail-closed:** when pass-2 swaps `SLUICE_PHANTOM:<pool>.refresh`, sluice records `realRefreshToken -> member` in a short-TTL map; on the token-endpoint response it recovers the member by that real refresh token and persists to it (`persistAddonOAuthTokens(member, ...)`, singleflight `"persist:"+member`). The join key is the real **refresh** token — never the access token, connection, or `OAuthIndex.Match` (two pooled members share `auth.openai.com`'s token URL and collide). Unrecoverable -> WARNING + skip the write (rotating refresh tokens are single-use; a mis-attributed write bricks both accounts). **Plain-credential disambiguation:** a plain OAuth credential sharing a pool's token URL also tags its injected refresh token `realRefreshToken -> <plain name>` (plain path in `buildPhantomPairs`/`buildOAuthPhantomPairs`'s `onRefreshInject`, incl. split-host expansion); on response a recovered non-member (`PoolForMember == ""`) is attributed 1:1 to that plain credential, NOT fail-closed as pooled. The pooled fail-closed path applies only when recovery fails or resolves to an actual member; `poolForResponse` gates the same on an independent `flowInjected` tag (set post-swap only if a pool phantom was present) before cooling a member.
 - **R3 pool-stable phantom JWT:** Codex access tokens are JWTs; the per-real-token `resignJWT` would emit a different phantom after each cross-member refresh, breaking "agent never notices". `poolStablePhantomAccess` (`internal/proxy/oauth_response.go`) builds the phantom JWT from a deterministic synthetic payload keyed on the **pool name** (`sub: sluice-pool:<pool>`, `iss: sluice-phantom`, fixed far-future `exp`, no `iat`), HMAC-SHA256 with the fixed key — byte-identical across switches, structurally valid. Pool name is JSON-marshaled (never concatenated) so quotes/control chars can't inject claims. Static-form fallback (`SLUICE_PHANTOM:<pool>.access`) only on the unreachable `json.Marshal` failure. Refresh phantom stays static `SLUICE_PHANTOM:<pool>.refresh`.
 
diff --git a/docs/plans/20260518-sticky-failover.md b/docs/plans/20260518-sticky-failover.md
index 50e119f..52b7456 100644
--- a/docs/plans/20260518-sticky-failover.md
+++ b/docs/plans/20260518-sticky-failover.md
@@ -86,18 +86,18 @@ one place.
 ## Steps
 
 ### Task 1: Sticky selection in vault.PoolResolver
-- [ ] Add swap-surviving per-pool current-active to shared `PoolHealth` (same mutex)
-- [ ] Rewrite `ResolveActive` to the sticky algorithm above; preserve degrade + parked semantics
-- [ ] Preserve CRITICAL-1 invariants (no loss/clobber across swap; stale generation safe)
-- [ ] Unit tests: sticky hold, flap regression (no snap-back), advance+wrap, degrade unchanged, swap-survival
-- [ ] `go test ./internal/vault/ -race`, gofumpt, vet
+- [x] Add swap-surviving per-pool current-active to shared `PoolHealth` (same mutex)
+- [x] Rewrite `ResolveActive` to the sticky algorithm above; preserve degrade + parked semantics
+- [x] Preserve CRITICAL-1 invariants (no loss/clobber across swap; stale generation safe)
+- [x] Unit tests: sticky hold, flap regression (no snap-back), advance+wrap, degrade unchanged, swap-survival
+- [x] `go test ./internal/vault/ -race`, gofumpt, vet
 
 ### Task 2: Failover path + notification spam regression
-- [ ] Confirm `pool_failover.go` from->to now changes only on real exhaustion (sticky source of truth); adjust only if it bypasses `ResolveActive`
-- [ ] Test: one `cred_failover`+notice per real transition; zero events when a non-active member's cooldown lapses (fail-before/pass-after)
-- [ ] `go test ./internal/proxy/ -race`, gofumpt, vet
+- [x] Confirm `pool_failover.go` from->to now changes only on real exhaustion (sticky source of truth); adjust only if it bypasses `ResolveActive`
+- [x] Test: one `cred_failover`+notice per real transition; zero events when a non-active member's cooldown lapses (fail-before/pass-after)
+- [x] `go test ./internal/proxy/ -race`, gofumpt, vet
 
 ### Task 3: Docs + final validation
-- [ ] Update CLAUDE.md credential-pools section to describe sticky selection (replace the position-priority wording) and note the mode-toggle follow-up
-- [ ] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues; full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`
-- [ ] Independently verify committed HEAD builds and tests pass (do not trust subagent green)
+- [x] Update CLAUDE.md credential-pools section to describe sticky selection (replace the position-priority wording) and note the mode-toggle follow-up
+- [x] `gofumpt -l` clean; `golangci-lint run ./...` 0 issues; full `go test ./...`; `go vet ./...`; `go vet -tags=e2e ./e2e/`
+- [x] Independently verify committed HEAD builds and tests pass (do not trust subagent green)

From eb07a8ab3b4bd6dc703e1bf9bec5a578d3f9bb5a Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 21:09:45 +0800
Subject: [PATCH 5/8] perf(vault): read-mostly fast path and epoch-scoped
 sticky pointer in ResolveActive

---
 internal/vault/pool.go | 157 ++++++++++++++++++++++++++++++-----------
 1 file changed, 115 insertions(+), 42 deletions(-)

diff --git a/internal/vault/pool.go b/internal/vault/pool.go
index 7a3f5e9..fbce069 100644
--- a/internal/vault/pool.go
+++ b/internal/vault/pool.go
@@ -49,6 +49,20 @@ type memberIdentity struct {
 	epoch int64
 }
 
+// activeEntry is the sticky pointer's value: the member ResolveActive last
+// settled on PLUS the (pool, epoch) identity of the generation that wrote it.
+// Finding 3: storing only the member NAME let a remove/re-create of the same
+// pool name with an overlapping member name be accepted by a later
+// generation as a valid sticky hold even though that name belongs to the OLD
+// epoch/order. Carrying the epoch (the same monotonic membership epoch the
+// cooldown gate keys on via memberIdentity) lets ResolveActive reject a
+// stored sticky entry whose epoch no longer matches this generation's
+// identity for the pool and advance fresh instead.
+type activeEntry struct {
+	member string
+	epoch  int64
+}
+
 // PoolHealth is the mutex-guarded credential cooldown map. It is
 // deliberately a SEPARATE object from PoolResolver so it can outlive any
 // single resolver generation.
@@ -107,9 +121,18 @@ type PoolHealth struct {
 	// lock-ordering hazard). It survives swaps because every generation
 	// shares one *PoolHealth; a stale generation cannot clobber it because
 	// ResolveActive only ever writes a member of THIS generation's member
-	// list, and the MergeLiveCooldowns / SetCurrentMembers prune drops the
-	// pointer for any pool no longer present (mirrors the cooldown prune).
-	active map[string]string
+	// list, AND the entry is epoch-scoped (activeEntry.epoch): a write records
+	// the writing generation's membership epoch, and ResolveActive ignores a
+	// stored entry whose epoch no longer matches THIS generation's identity
+	// for the pool (Finding 3 — a same-pool-name re-create with overlapping
+	// member names bumps the epoch, so the stale sticky hold is rejected and
+	// the pointer advances fresh). Both live reload paths that swap the member
+	// set — NewPoolResolverShared -> SetCurrentMembers AND
+	// MergeLiveCooldowns's shared-map branch — prune this map for any pool no
+	// longer present, whose recorded member is no longer in that pool, or
+	// whose recorded epoch no longer matches the new generation (mirrors the
+	// cooldown prune).
+	active map[string]activeEntry
 }
 
 // NewPoolHealth returns an empty shared health map. Call this exactly once
@@ -118,7 +141,7 @@ type PoolHealth struct {
 func NewPoolHealth() *PoolHealth {
 	return &PoolHealth{
 		health: make(map[string]memberHealth),
-		active: make(map[string]string),
+		active: make(map[string]activeEntry),
 	}
 }
 
@@ -131,13 +154,39 @@ func NewPoolHealth() *PoolHealth {
 // observes the OLD member set entirely or the NEW one entirely — it can never
 // observe a half-updated set, and it can never slip a non-member cooldown in
 // between the prune and the member-set swap.
+//
+// It ALSO prunes the sticky-pointer map under the same lock so this live
+// path stays consistent with MergeLiveCooldowns's shared-map prune (Finding
+// 1): on the server reload path NewPoolResolverShared calls this BEFORE
+// StorePool reaches MergeLiveCooldowns, so without pruning here a dropped or
+// epoch-bumped pool would briefly keep a stale sticky entry that
+// ResolveActive could observe. A sticky entry is dropped when the pool's
+// recorded member is no longer present in the new member set, or its epoch
+// no longer matches the new generation (Finding 3); an entry for a pool with
+// no surviving members is also dropped (every member of that pool would be
+// absent from `members`).
 func (ph *PoolHealth) SetCurrentMembers(members map[string]memberIdentity) {
 	if ph == nil {
 		return
 	}
 	ph.mu.Lock()
+	defer ph.mu.Unlock()
 	ph.currentMembers = members
-	ph.mu.Unlock()
+	ph.pruneActiveLocked(members)
+}
+
+// pruneActiveLocked drops sticky-pointer entries that are no longer valid for
+// the generation described by `members` (cred -> pool+epoch). An entry
+// survives only when its recorded member still maps to the SAME pool with the
+// SAME epoch. Caller must hold ph.mu. Shared by SetCurrentMembers and
+// MergeLiveCooldowns so both live reload paths prune identically.
+func (ph *PoolHealth) pruneActiveLocked(members map[string]memberIdentity) {
+	for poolName, ent := range ph.active {
+		id, stillMember := members[ent.member]
+		if !stillMember || id.pool != poolName || id.epoch != ent.epoch {
+			delete(ph.active, poolName)
+		}
+	}
 }
 
 // Seed merges store-persisted cooldown rows into the shared map. It is
@@ -344,33 +393,68 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 		return "", false
 	}
 
-	// Take the shared map's WRITE lock: the sticky pointer is mutated here
-	// when the active member is unhealthy/unset, and it must be consistent
-	// with the cooldown view a concurrent failover's MarkCooldown writes
-	// (same mu, no second lock, no lock-ordering hazard).
-	pr.health.mu.Lock()
-	defer pr.health.mu.Unlock()
-
-	now := time.Now()
+	// THIS generation's membership epoch for the pool. Every member of a pool
+	// shares one epoch (the store stamps all rows of a membership generation
+	// with the same monotonic value), so the first member's identity epoch is
+	// the pool's epoch. Used to reject a sticky entry written by an older
+	// generation after a same-pool-name re-create bumped the epoch (Finding
+	// 3). identity is immutable for this resolver, so it is read lock-free.
+	genEpoch := pr.identity[members[0]].epoch
 
 	cooling := func(m string) bool {
+		// Caller holds ph.mu (R or W).
 		h, tracked := pr.health.health[m]
-		return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(now)
+		return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(time.Now())
+	}
+
+	// Read-mostly fast path (Finding 2): the common case is a sticky hold —
+	// the recorded active member is still a member of THIS generation, its
+	// epoch matches, and it is not cooling. That requires no mutation, so it
+	// runs under the shared RLock and does not serialize with other resolves.
+	pr.health.mu.RLock()
+	if ent, set := pr.health.active[name]; set && ent.epoch == genEpoch {
+		for _, m := range members {
+			if m != ent.member {
+				continue
+			}
+			if !cooling(ent.member) {
+				pr.health.mu.RUnlock()
+				return ent.member, true
+			}
+			break
+		}
 	}
+	pr.health.mu.RUnlock()
+
+	// Slow path: the sticky pointer must be advanced/initialized (unset,
+	// stale-epoch, no longer a member, or cooling). Go's RWMutex has no
+	// in-place upgrade, so drop the RLock and take the WRITE lock, then
+	// RE-CHECK the sticky-hold condition: another goroutine may have advanced
+	// the pointer between RUnlock and Lock, in which case return it without
+	// re-advancing. The write lock keeps the sticky pointer consistent with
+	// the cooldown view a concurrent failover's MarkCooldown writes (same mu,
+	// no second lock, no lock-ordering hazard).
+	pr.health.mu.Lock()
+	defer pr.health.mu.Unlock()
 
 	// Position of the current sticky member in THIS generation's member
 	// list. startIdx == -1 (no valid current) makes the scan start at
-	// position 0; otherwise it starts AFTER cur and wraps.
+	// position 0; otherwise it starts AFTER cur and wraps. A stored entry
+	// whose epoch does not match this generation is treated as "no current
+	// active" (Finding 3): it belongs to an old epoch/order, so advance
+	// fresh rather than honor a cross-generation name collision.
 	startIdx := -1
-	if cur, set := pr.health.active[name]; set {
+	if ent, set := pr.health.active[name]; set && ent.epoch == genEpoch {
 		for i, m := range members {
-			if m != cur {
+			if m != ent.member {
 				continue
 			}
-			// Sticky hold: the recorded active member is still a member of
-			// this generation and is healthy — keep serving it. Do not move.
-			if !cooling(cur) {
-				return cur, true
+			// Re-check under the write lock: still a member of this
+			// generation and healthy — another resolver may have just
+			// advanced here, or it was a benign RLock->Lock race. Keep
+			// serving it; do not move.
+			if !cooling(ent.member) {
+				return ent.member, true
 			}
 			startIdx = i
 			break
@@ -385,7 +469,7 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 		idx := (startIdx + off) % n
 		m := members[idx]
 		if !cooling(m) {
-			pr.health.active[name] = m
+			pr.health.active[name] = activeEntry{member: m, epoch: genEpoch}
 			return m, true
 		}
 	}
@@ -593,26 +677,15 @@ func (pr *PoolResolver) MergeLiveCooldowns(prev *PoolResolver) {
 		pr.health.currentMembers = cm
 		// Sticky-pointer prune (mirrors the cooldown prune above): drop the
 		// recorded active member for any pool this generation no longer has,
-		// or whose recorded member is no longer in that pool. A surviving
-		// pool keeps its sticky member so a benign reload does NOT snap it
-		// back to position 0 (the whole point of CRITICAL-1 for the pointer).
-		for poolName, cur := range pr.health.active {
-			poolMembers, stillPool := pr.pools[poolName]
-			if !stillPool {
-				delete(pr.health.active, poolName)
-				continue
-			}
-			stillMember := false
-			for _, m := range poolMembers {
-				if m == cur {
-					stillMember = true
-					break
-				}
-			}
-			if !stillMember {
-				delete(pr.health.active, poolName)
-			}
-		}
+		// whose recorded member is no longer in that pool, or whose recorded
+		// epoch no longer matches this generation (Finding 3 — a same-pool-
+		// name re-create with overlapping member names bumps the epoch). A
+		// surviving pool with a still-valid same-epoch member keeps its sticky
+		// member so a benign reload does NOT snap it back to position 0 (the
+		// whole point of CRITICAL-1 for the pointer). Same predicate as
+		// SetCurrentMembers via the shared helper so both live reload paths
+		// prune identically (Finding 1).
+		pr.health.pruneActiveLocked(cm)
 		pr.health.mu.Unlock()
 		return
 	}

From a34000e8b295fcbf360002624075dddb318790ce Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 21:09:51 +0800
Subject: [PATCH 6/8] test(vault): cover sticky fast/advance paths, epoch
 clobber, and SetCurrentMembers prune

---
 internal/vault/pool_test.go | 112 ++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go
index 99acfa3..958bc59 100644
--- a/internal/vault/pool_test.go
+++ b/internal/vault/pool_test.go
@@ -769,3 +769,115 @@ func TestFinding3Round15_ConcurrentStaleMarkVsPruneUnderRace(t *testing.T) {
 		}
 	}
 }
+
+// TestResolveActiveStickyEpochClobberRegression is the Finding 3 regression.
+// A pool name is removed and re-created with an OVERLAPPING member name but a
+// strictly greater membership epoch (the store stamps every membership
+// generation with a monotonic epoch). Before the fix the sticky pointer
+// stored only the member NAME, so the new generation accepted the old
+// generation's stored name as a valid sticky hold even though it belongs to
+// the OLD epoch/order. Fail-before: gen2 honors the stale name and skips
+// fresh position-0 selection. Pass-after: the epoch mismatch makes gen2
+// treat it as "no current active" and select by fresh position order.
+func TestResolveActiveStickyEpochClobberRegression(t *testing.T) {
+	shared := NewPoolHealth()
+	const e1 = int64(1)
+	const e2 = int64(2)
+
+	// gen1: pool P = [a, b] at epoch e1. Fail a over so the sticky pointer
+	// settles on "b".
+	gen1 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("P", e1, "a", "b")}, nil, shared)
+	if got, _ := gen1.ResolveActive("P"); got != "a" {
+		t.Fatalf("gen1 initial = %q, want a", got)
+	}
+	gen1.MarkCooldownScoped("a", "P", e1, time.Now().Add(300*time.Second), "failover:429")
+	if got, _ := gen1.ResolveActive("P"); got != "b" {
+		t.Fatalf("gen1 after cooling a = %q, want b", got)
+	}
+
+	// P is removed and RE-CREATED with the SAME name but a DIFFERENT member
+	// order [b, a] at a strictly greater epoch e2. "b" still exists by name
+	// but at epoch e2 / position 0; the stored sticky entry (member "b",
+	// epoch e1) must NOT be honored — the new generation must pick by fresh
+	// position order, which is "b" at position 0. To make the test prove the
+	// epoch check (not just coincide), the new order is [c, b]: position 0 is
+	// "c". A name-only sticky pointer would wrongly return "b"; the
+	// epoch-scoped pointer rejects the stale entry and returns fresh
+	// position-0 "c".
+	gen2 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("P", e2, "c", "b")}, nil, shared)
+	gen2.MergeLiveCooldowns(gen1)
+	if got, ok := gen2.ResolveActive("P"); !ok || got != "c" {
+		t.Fatalf("Finding 3: stale-epoch sticky entry honored; got %q,%v want c,true "+
+			"(epoch-bumped re-create must NOT inherit the old generation's sticky member)", got, ok)
+	}
+}
+
+// TestResolveActiveFastPathAndAdvancePath exercises both Finding 2 lock
+// paths for correctness: the RLock sticky-hold fast path (no mutation) and
+// the write-lock advance path. It is a correctness test, not a benchmark.
+func TestResolveActiveFastPathAndAdvancePath(t *testing.T) {
+	shared := NewPoolHealth()
+	pr := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared)
+
+	// First call: no sticky entry -> advance/init path (write lock), settles a.
+	if got, _ := pr.ResolveActive("pool"); got != "a" {
+		t.Fatalf("init = %q, want a", got)
+	}
+	// Subsequent calls: sticky hold -> RLock fast path, must keep returning a.
+	for i := 0; i < 100; i++ {
+		if got, _ := pr.ResolveActive("pool"); got != "a" {
+			t.Fatalf("fast-path call %d = %q, want a (sticky hold)", i, got)
+		}
+	}
+	// Cool a -> next call takes the advance (write-lock) path and moves to b.
+	pr.MarkCooldown("a", time.Now().Add(120*time.Second), "429")
+	if got, _ := pr.ResolveActive("pool"); got != "b" {
+		t.Fatalf("after cooling a = %q, want b (advance path)", got)
+	}
+	// Now b is the sticky hold: fast path again.
+	for i := 0; i < 100; i++ {
+		if got, _ := pr.ResolveActive("pool"); got != "b" {
+			t.Fatalf("fast-path call %d after advance = %q, want b", i, got)
+		}
+	}
+	// Concurrent resolves under the read-mostly pattern must all agree and be
+	// race-clean (run under -race).
+	var wg sync.WaitGroup
+	for i := 0; i < 64; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for j := 0; j < 200; j++ {
+				if got, _ := pr.ResolveActive("pool"); got != "b" {
+					t.Errorf("concurrent resolve = %q, want b", got)
+					return
+				}
+			}
+		}()
+	}
+	wg.Wait()
+}
+
+// TestSetCurrentMembersPrunesStaleActive pins Finding 1: SetCurrentMembers is
+// a live reload path (NewPoolResolverShared calls it before StorePool reaches
+// MergeLiveCooldowns), so it must itself prune a stale sticky entry. This
+// drives SetCurrentMembers directly with a member set that drops the pool's
+// recorded member and asserts the sticky pointer is gone.
+func TestSetCurrentMembersPrunesStaleActive(t *testing.T) {
+	shared := NewPoolHealth()
+	pr := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b")}, nil, shared)
+	if got, _ := pr.ResolveActive("pool"); got != "a" {
+		t.Fatalf("init = %q, want a", got)
+	}
+	// New generation: pool no longer contains "a" (membership changed). Drive
+	// SetCurrentMembers directly with the new member set (b only, epoch 2).
+	shared.SetCurrentMembers(map[string]memberIdentity{
+		"b": {pool: "pool", epoch: 2},
+	})
+	shared.mu.RLock()
+	_, stillActive := shared.active["pool"]
+	shared.mu.RUnlock()
+	if stillActive {
+		t.Fatal("Finding 1: SetCurrentMembers did not prune the stale sticky pointer for a dropped/epoch-bumped member")
+	}
+}

From ee609ab8248fcbcccaca5dcc8b2d5e08bc6eaeaa Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 21:21:51 +0800
Subject: [PATCH 7/8] perf(vault): single time.Now snapshot per ResolveActive
 and drop write lock before degrade logging

---
 internal/vault/pool.go | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/internal/vault/pool.go b/internal/vault/pool.go
index fbce069..d818fc3 100644
--- a/internal/vault/pool.go
+++ b/internal/vault/pool.go
@@ -401,10 +401,20 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 	// 3). identity is immutable for this resolver, so it is read lock-free.
 	genEpoch := pr.identity[members[0]].epoch
 
+	// Finding 1: one time snapshot for the whole resolve. cooling() runs once
+	// per member in the scan loops; calling time.Now() per invocation made the
+	// cooldown gate observe a drifting clock within a single resolve (and was
+	// needless syscall churn). Capture now ONCE here so the RLock fast path and
+	// the write-lock advance/degrade slow path evaluate every member against
+	// one coherent instant. Semantics are unchanged: a member is cooling iff
+	// cooldownUntil.After(now).
+	now := time.Now()
+
 	cooling := func(m string) bool {
-		// Caller holds ph.mu (R or W).
+		// Caller holds ph.mu (R or W). Compared against the resolve-wide `now`
+		// snapshot (Finding 1), not a fresh time.Now() per call.
 		h, tracked := pr.health.health[m]
-		return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(time.Now())
+		return tracked && !h.cooldownUntil.IsZero() && h.cooldownUntil.After(now)
 	}
 
 	// Read-mostly fast path (Finding 2): the common case is a sticky hold —
@@ -435,7 +445,14 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 	// the cooldown view a concurrent failover's MarkCooldown writes (same mu,
 	// no second lock, no lock-ordering hazard).
 	pr.health.mu.Lock()
-	defer pr.health.mu.Unlock()
+	// Finding 3: the write lock is held ONLY for the sticky-pointer
+	// re-check, the advance mutation, and the degrade-target selection.
+	// Each slow-path exit Unlocks EXPLICITLY before its return/log.Printf
+	// (no `defer`), so the lock never spans logging or the value return and
+	// concurrent ResolveActive / MarkCooldown are not serialized behind log
+	// I/O. Every path below unlocks exactly once before returning; there is
+	// no path that returns while still holding the lock and none unlocks
+	// twice.
 
 	// Position of the current sticky member in THIS generation's member
 	// list. startIdx == -1 (no valid current) makes the scan start at
@@ -454,6 +471,7 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 			// advanced here, or it was a benign RLock->Lock race. Keep
 			// serving it; do not move.
 			if !cooling(ent.member) {
+				pr.health.mu.Unlock()
 				return ent.member, true
 			}
 			startIdx = i
@@ -470,6 +488,7 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 		m := members[idx]
 		if !cooling(m) {
 			pr.health.active[name] = activeEntry{member: m, epoch: genEpoch}
+			pr.health.mu.Unlock()
 			return m, true
 		}
 	}
@@ -498,13 +517,22 @@ func (pr *PoolResolver) ResolveActive(name string) (member string, ok bool) {
 			}
 		}
 	}
+	// Finding 3: the degrade TARGET (and the values the WARNING needs) are
+	// fully computed above under the write lock; the sticky pointer is
+	// deliberately NOT moved in the degrade case. Release the lock here,
+	// BEFORE the log.Printf and return, so logging never serializes
+	// concurrent ResolveActive / MarkCooldown. memberCount/name/soonest*
+	// are locals captured under the lock; the post-Unlock log/return only
+	// reads them, never the shared map.
+	memberCount := len(members)
+	pr.health.mu.Unlock()
 	if soonestParked != "" {
 		log.Printf("[POOL] all %d members of pool %q are cooling; degrading to operator-parked-but-healthy %q",
-			len(members), name, soonestParked)
+			memberCount, name, soonestParked)
 		return soonestParked, true
 	}
 	log.Printf("[POOL] all %d members of pool %q are in cooldown; degrading to %q (recovers %s)",
-		len(members), name, soonest, soonestUntil.Format(time.RFC3339))
+		memberCount, name, soonest, soonestUntil.Format(time.RFC3339))
 	return soonest, true
 }
 

From 57e72e68f4404644cac9007f086bacf9e99da11e Mon Sep 17 00:00:00 2001
From: Nikita Nemirovsky <vaze.legend@gmail.com>
Date: Mon, 18 May 2026 21:21:58 +0800
Subject: [PATCH 8/8] test(vault): use realistic epoch>=1 in sticky-pointer
 tests and add concurrent degrade no-double-unlock test

---
 internal/vault/pool_test.go | 69 ++++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/internal/vault/pool_test.go b/internal/vault/pool_test.go
index 958bc59..1318223 100644
--- a/internal/vault/pool_test.go
+++ b/internal/vault/pool_test.go
@@ -189,7 +189,13 @@ func ManualRotateCooldownForTest() time.Duration { return 300 * time.Second }
 // it back to position 0.
 func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) {
 	shared := NewPoolHealth()
-	gen1 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared)
+	// Realistic epoch >= 1 (Finding 2): production membership rows are stamped
+	// with a monotonic epoch >= 1; epoch 0 is documented as "never live", so a
+	// zero-epoch "live" generation is unrealistic and could mask an
+	// epoch-guard regression. All generations here rebuild the SAME pool
+	// (membership unchanged) so they share epoch 1 — the sticky pointer must
+	// survive the swap precisely because the epoch still matches.
+	gen1 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b")}, nil, shared)
 	if got, _ := gen1.ResolveActive("pool"); got != "a" {
 		t.Fatalf("gen1 initial = %q, want a", got)
 	}
@@ -201,8 +207,10 @@ func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) {
 	// "a" recovers (durable write may not have landed).
 	gen1.MarkCooldown("a", time.Time{}, "")
 
-	// Reload: fresh generation, SAME shared health, store has no rows.
-	gen2 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b")}, nil, shared)
+	// Reload: fresh generation, SAME shared health, store has no rows. Same
+	// pool/membership -> same epoch 1, so the sticky entry's epoch still
+	// matches and the pointer must survive (not snap back).
+	gen2 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b")}, nil, shared)
 	gen2.MergeLiveCooldowns(gen1)
 	// Sticky pointer survived the swap: gen2 keeps serving "b", not "a".
 	if got, _ := gen2.ResolveActive("pool"); got != "b" {
@@ -220,15 +228,66 @@ func TestResolveActiveStickyPointerSurvivesRebuildAndSwap(t *testing.T) {
 
 	// A pool dropped entirely prunes its sticky pointer (mirrors cooldown
 	// prune) so a re-add does not inherit a stale active member.
-	gen3 := NewPoolResolverShared([]store.Pool{mkPool("other", "x")}, nil, shared)
+	gen3 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("other", 1, "x")}, nil, shared)
 	gen3.MergeLiveCooldowns(gen2)
-	gen4 := NewPoolResolverShared([]store.Pool{mkPool("pool", "a", "b"), mkPool("other", "x")}, nil, shared)
+	gen4 := NewPoolResolverShared([]store.Pool{mkPoolEpoch("pool", 1, "a", "b"), mkPoolEpoch("other", 1, "x")}, nil, shared)
 	gen4.MergeLiveCooldowns(gen3)
 	if got, _ := gen4.ResolveActive("pool"); got != "a" {
 		t.Fatalf("re-added pool active = %q, want a (dropped pool's sticky pointer must be pruned)", got)
 	}
 }
 
+// TestResolveActiveConcurrentDegradeNoDoubleUnlock pins Finding 3: the slow
+// path now Unlocks the write lock EXPLICITLY (no defer) before every
+// return/log.Printf, including the all-cooling degrade exit. A
+// double-unlock or unlock-of-rlock would panic/race; a missed unlock would
+// deadlock. Drive many concurrent ResolveActive calls with EVERY member
+// cooling (forces the degrade exit every time) and assert the result is
+// stable and the run is race-clean (run under -race). Behavior is unchanged:
+// the soonest-recovering member is still returned and the sticky pointer is
+// not moved.
+func TestResolveActiveConcurrentDegradeNoDoubleUnlock(t *testing.T) {
+	now := time.Now()
+	health := []store.CredentialHealth{
+		{Credential: "a", Status: "cooldown", CooldownUntil: now.Add(300 * time.Second), LastFailureReason: "401"},
+		{Credential: "b", Status: "cooldown", CooldownUntil: now.Add(30 * time.Second), LastFailureReason: "429"},
+	}
+	pr := NewPoolResolver([]store.Pool{mkPool("pool", "a", "b")}, health)
+
+	var wg sync.WaitGroup
+	for i := 0; i < 64; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for j := 0; j < 300; j++ {
+				got, ok := pr.ResolveActive("pool")
+				if !ok || got != "b" {
+					t.Errorf("degrade resolve = %q,%v; want b,true (soonest)", got, ok)
+					return
+				}
+			}
+		}()
+	}
+	wg.Wait()
+	// Concurrent MarkCooldown vs degrade ResolveActive: same mu, must be
+	// race-clean and never deadlock after the explicit-unlock restructure.
+	var wg2 sync.WaitGroup
+	wg2.Add(2)
+	go func() {
+		defer wg2.Done()
+		for j := 0; j < 500; j++ {
+			pr.ResolveActive("pool")
+		}
+	}()
+	go func() {
+		defer wg2.Done()
+		for j := 0; j < 500; j++ {
+			pr.MarkCooldown("a", time.Now().Add(300*time.Second), "401")
+		}
+	}()
+	wg2.Wait()
+}
+
 func TestResolveActivePassthroughForNonPool(t *testing.T) {
 	pr := NewPoolResolver(nil, nil)
 	got, ok := pr.ResolveActive("plain_cred")