From a07081eac80d908a1dac8f779c61c16184bdd2a2 Mon Sep 17 00:00:00 2001 From: Simone Tiraboschi Date: Thu, 30 Apr 2026 11:30:23 +0200 Subject: [PATCH] feat: add KubevirtMigrationAware evictor plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new EvictorPlugin that makes the descheduler aware of KubeVirt live-migration state when deciding whether to evict virt-launcher pods. Filter (hard block): prevents eviction of pods whose VMI has a migration in progress (startTimestamp set, endTimestamp absent in migrationState). KubeVirt's own admission webhook provides a complementary safety net at the API layer; this plugin acts upstream of it to avoid the round-trip. PreEvictionFilter (soft block): defers eviction of pods whose VMI completed a migration recently, using a three-layer adaptive cooldown: 1. Base: max(migrationCooldown, migrationDuration) — heavier VMs (longer migrations) automatically receive longer protection. 2. Backoff: base × 2^(count−1) where count is the number of migration completions recorded in a configurable sliding history window (default 24h). Each successive migration within the window doubles the cooldown, making repeated churn progressively harder. 3. Cap: the result is bounded by maxMigrationCooldown (default 6h) to prevent pathological cases from locking a VM indefinitely. Defaults: migrationCooldown=15m, maxMigrationCooldown=6h, migrationHistoryWindow=24h. All three are operator-configurable. Both extension points read from a dedicated dynamic VMI informer cache (kubevirt.io/v1 VirtualMachineInstances), avoiding API-server calls in the hot eviction path. An UpdateFunc event handler on the same informer records migration completions by VMI UID to drive the backoff history. The cache warms up at startup with a 30s timeout; failure to sync is a hard error so the descheduler does not start with stale or empty state. 
Two Prometheus metrics are registered on first use: - descheduler_kubevirt_eviction_blocks_total{reason,node,namespace} counter — tracks eviction blocks for alerting and per-node diagnosis. - descheduler_kubevirt_effective_cooldown_seconds histogram — shows the distribution of applied cooldown durations across backoff buckets (15m, 30m, 1h, 2h, 4h, 6h) so operators can tell whether the backoff is engaging or VMs are piling up at the cap. All code paths that cannot retrieve or parse VMI state fail open (allow eviction) so the plugin never blocks unrelated workloads. Unit tests cover Filter, PreEvictionFilter (base cooldown, adaptive duration, exponential backoff, maxMigrationCooldown cap), migration history recording and pruning, informer event handler, defaults, and validation. No kubevirt imports are required: VMI state is expressed as plain *unstructured.Unstructured objects, exactly as the dynamic informer delivers them at runtime. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Simone Tiraboschi --- pkg/descheduler/setupplugins.go | 2 + .../plugins/kubevirtmigrationaware/README.md | 535 ++++++++++++++++ .../kubevirtmigrationaware/defaults.go | 47 ++ .../kubevirtmigrationaware/defaults_test.go | 55 ++ .../plugins/kubevirtmigrationaware/doc.go | 19 + .../kubevirtmigrationaware.go | 408 ++++++++++++ .../kubevirtmigrationaware_test.go | 592 ++++++++++++++++++ .../kubevirtmigrationaware/register.go | 31 + .../plugins/kubevirtmigrationaware/types.go | 49 ++ .../kubevirtmigrationaware/validation.go | 41 ++ .../kubevirtmigrationaware/validation_test.go | 114 ++++ .../zz_generated.deepcopy.go | 54 ++ .../zz_generated.defaults.go | 33 + 13 files changed, 1980 insertions(+) create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/README.md create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/defaults.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/defaults_test.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/doc.go 
create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware_test.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/register.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/types.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/validation.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/validation_test.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/zz_generated.deepcopy.go create mode 100644 pkg/framework/plugins/kubevirtmigrationaware/zz_generated.defaults.go diff --git a/pkg/descheduler/setupplugins.go b/pkg/descheduler/setupplugins.go index eff3d049ae..59c67f5b10 100644 --- a/pkg/descheduler/setupplugins.go +++ b/pkg/descheduler/setupplugins.go @@ -19,6 +19,7 @@ package descheduler import ( "sigs.k8s.io/descheduler/pkg/framework/pluginregistry" "sigs.k8s.io/descheduler/pkg/framework/plugins/defaultevictor" + "sigs.k8s.io/descheduler/pkg/framework/plugins/kubevirtmigrationaware" "sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization" "sigs.k8s.io/descheduler/pkg/framework/plugins/podlifetime" "sigs.k8s.io/descheduler/pkg/framework/plugins/removeduplicates" @@ -47,4 +48,5 @@ func RegisterDefaultPlugins(registry pluginregistry.Registry) { pluginregistry.Register(removepodsviolatingnodeaffinity.PluginName, removepodsviolatingnodeaffinity.New, &removepodsviolatingnodeaffinity.RemovePodsViolatingNodeAffinity{}, &removepodsviolatingnodeaffinity.RemovePodsViolatingNodeAffinityArgs{}, removepodsviolatingnodeaffinity.ValidateRemovePodsViolatingNodeAffinityArgs, removepodsviolatingnodeaffinity.SetDefaults_RemovePodsViolatingNodeAffinityArgs, registry) pluginregistry.Register(removepodsviolatingnodetaints.PluginName, removepodsviolatingnodetaints.New, &removepodsviolatingnodetaints.RemovePodsViolatingNodeTaints{}, 
&removepodsviolatingnodetaints.RemovePodsViolatingNodeTaintsArgs{}, removepodsviolatingnodetaints.ValidateRemovePodsViolatingNodeTaintsArgs, removepodsviolatingnodetaints.SetDefaults_RemovePodsViolatingNodeTaintsArgs, registry) pluginregistry.Register(removepodsviolatingtopologyspreadconstraint.PluginName, removepodsviolatingtopologyspreadconstraint.New, &removepodsviolatingtopologyspreadconstraint.RemovePodsViolatingTopologySpreadConstraint{}, &removepodsviolatingtopologyspreadconstraint.RemovePodsViolatingTopologySpreadConstraintArgs{}, removepodsviolatingtopologyspreadconstraint.ValidateRemovePodsViolatingTopologySpreadConstraintArgs, removepodsviolatingtopologyspreadconstraint.SetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs, registry) + pluginregistry.Register(kubevirtmigrationaware.PluginName, kubevirtmigrationaware.New, &kubevirtmigrationaware.KubevirtMigrationAware{}, &kubevirtmigrationaware.KubevirtMigrationAwareArgs{}, kubevirtmigrationaware.ValidateKubevirtMigrationAwareArgs, kubevirtmigrationaware.SetDefaults_KubevirtMigrationAwareArgs, registry) } diff --git a/pkg/framework/plugins/kubevirtmigrationaware/README.md b/pkg/framework/plugins/kubevirtmigrationaware/README.md new file mode 100644 index 0000000000..60e785a306 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/README.md @@ -0,0 +1,535 @@ +# KubevirtMigrationAware + +An `EvictorPlugin` that makes the descheduler aware of KubeVirt live-migration +state when deciding whether to evict `virt-launcher` pods. It prevents +evictions during active migrations and applies a self-tuning cooldown after +migrations complete, reducing per-VM churn without modifying any other part of +the descheduler. + +--- + +## 1. Problem + +### 1.1 The descheduler rebalances by evicting pods — but VMs are not pods + +The descheduler measures node utilisation, identifies outlier nodes, and evicts +pods from overloaded nodes so that the scheduler can place them somewhere +better. 
For regular stateless workloads this is harmless: the pod restarts +quickly on a new node and the cluster converges. + +For KubeVirt virtual machines the same eviction triggers a **live migration**. +The `virt-launcher` pod is evicted, KubeVirt moves the VM's memory and CPU +state to a destination node, and a new `virt-launcher` pod appears there. The +migration itself consumes CPU, memory bandwidth, and network capacity on both +source and destination nodes — resources that are visible to the very metrics +the descheduler uses to decide its next move. + +### 1.2 Per-VM churn: the same VM migrated repeatedly + +Without any awareness of migration state, the descheduler can evict the same +VM multiple times in quick succession: + +1. VM is on node A (overloaded) → descheduler evicts it → migration starts. +2. Migration completes; VM is now on node B. +3. Node B's utilisation rises because the VM just landed there and is warming + up its CPU caches and memory working set. +4. Node B now looks like an outlier → descheduler evicts the VM again. +5. Repeat. + +This **churn loop** produces more migrations than the cluster needs, degrades +VM performance, and can prevent the cluster from ever reaching a stable state. + +### 1.3 Non-convergence at cluster scale + +A subtler and harder problem arises when the descheduler's outlier detection +uses cluster-relative thresholds. If all nodes run at sustained moderate load, +comparing nodes against each other always produces outliers — the cluster looks +imbalanced even when it is as balanced as the workload allows. + +Under these conditions the descheduler keeps evicting. Measurements on a +300-VM cluster running a sustained CPU/RAM stress profile showed roughly +**one eviction per VM per day** even after the cluster was already +post-rebalance. 
Raising the outlier margin from 10% to 20% cuts the volume +significantly; this plugin further reduces harm by rate-limiting how often any +individual VM can be evicted — but it does not change the fundamental +convergence property of the outlier algorithm. + +**Important:** this plugin addresses *per-VM churn* (the same VM migrated +repeatedly). It does not reduce *total migration volume*: if 50 VMs are in +cooldown, the descheduler will target the other 50. For total-volume +reduction, raise the outlier threshold in your descheduler profile. + +--- + +## 2. Architecture + +### 2.1 The descheduler is VM-blind by default + +The descheduler only watches `Pod` objects. It has no built-in concept of +VirtualMachineInstances (VMIs), migration state, or whether an eviction will +trigger a live migration or a simple pod restart. + +To protect VMs we need the descheduler to consult KubeVirt's VMI status before +evicting. This plugin provides that bridge. + +### 2.2 A dedicated VMI informer — no API-server calls in the hot path + +The plugin creates its own `DynamicSharedInformerFactory` that watches +`virtualmachineinstances.kubevirt.io/v1` across all namespaces. VMI objects +are stored in a local in-memory cache (a standard `client-go` informer store). + +Every call to `Filter` or `PreEvictionFilter` reads from this cache — no +API-server round-trip in the eviction hot path. The cache warms up at plugin +startup with a 30-second timeout; if the VMI informer cannot sync (e.g. +KubeVirt is not installed), the plugin fails fast and prevents the descheduler +from starting. + +The plugin also registers an `UpdateFunc` event handler on the same informer. +When a VMI's `status.migrationState.endTimestamp` transitions to a new +non-empty value — meaning a migration just completed — the handler records the +event in an in-memory history map keyed by VMI UID. This history drives the +exponential backoff described in §3.2. 
+ +The link between a `virt-launcher` pod and its VMI is the annotation +`kubevirt.io/domain` that KubeVirt sets on every `virt-launcher` pod. Its +value is the VMI name. Pods without this annotation are not `virt-launcher` +pods and pass through both extension points unchanged. + +### 2.3 Two extension points: hard block vs. soft defer + +The descheduler's evictor pipeline exposes two distinct hooks, and the plugin +uses both for different purposes. + +**`Filter` — hard block.** Called during candidate selection. Returning +`false` removes the pod from the eviction candidate set entirely for this +descheduler cycle. The plugin uses this to block eviction of any +`virt-launcher` pod whose VMI has a migration actively in progress +(`startTimestamp` present, `endTimestamp` absent in `migrationState`). + +KubeVirt's `virt-api` already provides a complementary safety net: a +validating admission webhook that intercepts eviction requests and rejects +them when a migration is already in progress for that VM. Our `Filter` +acts upstream of that — it prevents the eviction attempt from being issued +at all, avoiding the API round-trip. In a distributed environment where +concurrent control loops may race, KubeVirt's webhook remains the +authoritative last line of defence; this plugin is defence-in-depth, not a +replacement. + +**`PreEvictionFilter` — soft defer.** Called immediately before each +individual eviction is issued. Returning `false` skips this pod and lets the +eviction loop try the next candidate on the same node. The plugin uses this to +apply the cooldown logic described in §3: if the VM migrated recently and the +cooldown has not expired, the eviction is deferred rather than hard-blocked, +giving other pods on the same node a chance to be evicted instead. + +The practical difference: `Filter` stops a pod from being a candidate at all; +`PreEvictionFilter` lets the loop skip a specific pod and try others. 
Both +must be enabled in the descheduler profile (see §5). + +--- + +## 3. Cooldown Logic + +### Two protection tiers with different durability + +Before describing the individual layers it is important to understand that the +cooldown logic has two fundamentally different durability tiers: + +**Tier A — VMI-persisted (survives descheduler pod restarts).** +Layer 1 reads `startTimestamp` and `endTimestamp` directly from +`status.migrationState` on the VMI object. KubeVirt writes and owns this +field; it persists on the VMI regardless of what happens to the descheduler +pod. A rolling update, an OOM kill, or a node eviction of the descheduler pod +does not erase it. Critically, this tier also captures the *cost* of the +migration: the difference between the two timestamps tells the plugin how long +that VM took to migrate, which directly raises the cooldown for expensive VMs. + +**Tier B — in-memory only (lost on descheduler pod restart).** +Layer 2 keeps a sliding-window history of migration-completion events in the +plugin's process memory. This drives the exponential backoff: a VM that +migrates repeatedly within the window gets a progressively longer cooldown. +The history is populated by informer events at runtime and is not persisted +anywhere. If the descheduler pod restarts the history is reset, and VMs that +had accumulated backoff appear clean again — see §8 for the operational +implications. + +The two tiers complement each other: Tier A provides a baseline guarantee that +is always present and restart-safe; Tier B adds stronger churn resistance for +VMs that are actively churning and its absence after a restart is typically +short-lived because the in-memory history rebuilds as migrations continue. + +--- + +The cooldown is computed in three layers applied in order. Layers 1 and 2 can +only increase the effective cooldown; layer 3 then caps the result at +`maxMigrationCooldown`.
+ +### 3.1 Layer 1 — base adaptive cooldown + +``` +effectiveCooldown = max(migrationCooldown, migrationDuration) +``` + +`migrationCooldown` is the operator-configured minimum (default **15 minutes**). +`migrationDuration` is `endTimestamp − startTimestamp` read from the VMI's +`status.migrationState`. + +The `max` means that heavier VMs — ones that take a long time to migrate +because they have large memory footprints or high dirty-page rates — receive +proportionally longer protection automatically, without any manual per-VM +configuration. + +**Examples with default `migrationCooldown = 15m`:** + +| VM type | Migration duration | Effective cooldown (layer 1) | +|---|---|---| +| Small idle VM | 30 s | 15 m (configured floor dominates) | +| Medium VM | 10 m | 15 m (configured floor dominates) | +| Large memory VM | 25 m | 25 m (duration dominates) | +| Monster VM | 90 m | 90 m (duration dominates) | + +If `startTimestamp` is absent or malformed the migration duration cannot be +computed; the plugin falls back to `migrationCooldown` alone. + +### 3.2 Layer 2 — exponential backoff from migration history + +``` +effectiveCooldown = layer1Result × 2^(count − 1) (for count ≥ 1) +``` + +`count` is the number of migration completions recorded for this VMI in the +`migrationHistoryWindow` (default **24 hours**). The history is populated by +the informer event handler (§2.2), which fires each time a VMI's +`endTimestamp` changes — capturing every migration the cluster runs for that +VMI, not just descheduler-caused ones. 
+ +The effect is that each successive migration within the window doubles the +cooldown, making it progressively harder to evict a VM that keeps getting +churned: + +**Examples with default `migrationCooldown = 15m`, small VM (duration < 15m):** + +| Migrations in last 24h | Effective cooldown (layer 2) | +|---:|---| +| 0 or 1 | 15 m | +| 2 | 30 m | +| 3 | 1 h | +| 4 | 2 h | +| 5 | 4 h | +| 6+ | capped by layer 3 | + +A VM that migrates once or twice recovers its normal cooldown naturally as old +history entries age out of the 24-hour window. A VM that migrates 5+ times in +a day is almost certainly in a churn loop; it gets progressively longer +protection until the loop breaks. + +> **Note on the race condition:** the informer event handler fires +> asynchronously. Between the moment a migration completes and the moment the +> descheduler's next cycle calls `PreEvictionFilter`, the event handler may or +> may not have recorded the completion yet. In the worst case `count` is +> under-reported by 1, meaning the backoff multiplier is half the value +> expected for a single cycle. This is intentional and acceptable: the correct +> value is applied in the next cycle. + +### 3.3 Layer 3 — maximum cooldown cap + +``` +effectiveCooldown = min(layer2Result, maxMigrationCooldown) +``` + +`maxMigrationCooldown` (default **6 hours**) bounds the growth from both the +adaptive duration (layer 1) and the exponential backoff (layer 2). + +Without this cap, a VM with 8 migrations in 24 hours on a 15-minute base +cooldown would reach `15m × 2^7 = 32 h`, locking the VM for longer than the +history window itself. The cap ensures the descheduler always has an +opportunity to re-evaluate after at most 6 hours, regardless of how severe the +churn history is. + +The cap cannot be disabled with `maxMigrationCooldown: 0`: a zero value is treated as unset and replaced by the 6-hour default. To relax the cap, set a large explicit value; this is not recommended +with a long `migrationHistoryWindow` because backoff can then grow very large.
+ +**Full worked example — large VM in a churn loop:** + +Assume: `migrationCooldown: 15m`, `maxMigrationCooldown: 6h`, +`migrationHistoryWindow: 24h`. The VM has a 30-minute migration duration. + +| Event | Time | count in 24h window | Layer 1 | Layer 2 | Layer 3 (cap) | +|---|---|---:|---|---|---| +| 1st migration completes | T+0 | 1 | 30 m | 30 m | 30 m | +| 2nd migration completes | T+31m | 2 | 30 m | 60 m | 60 m | +| 3rd migration completes | T+2h | 3 | 30 m | 2 h | 2 h | +| 4th migration completes | T+5h | 4 | 30 m | 4 h | 4 h | +| 5th migration completes | T+10h | 5 | 30 m | 8 h | **6 h** (capped) | +| 1st entry ages out at T+24h | T+25h | 4 | 30 m | 4 h | 4 h | + +The VM steps back down gradually as history entries age out — it does not +suddenly go from fully protected to fully evictable. + +--- + +## 4. Configuration Reference + +All fields are optional. Omitting a field (or setting it to `0`) causes the +default to apply; a zero value therefore cannot be used to disable a setting. + +| Field | Default | Valid range | Description | +|---|---|---|---| +| `migrationCooldown` | `15m` | `≥ 0` | Minimum cooldown after any migration. `0` is treated as unset and replaced by the default; to let the adaptive duration (layer 1) dominate, set a small non-zero value such as `1s`. | +| `maxMigrationCooldown` | `6h` | `≥ migrationCooldown` or `0` | Upper bound on the effective cooldown after all layers. `0` is treated as unset and replaced by the default, so it does not disable the cap. | +| `migrationHistoryWindow` | `24h` | `≥ 0` | Sliding window for migration-count history used by exponential backoff. `0` is treated as unset and replaced by the default, so it does not disable backoff. | + +**Validation rules:** +- `migrationCooldown` must be non-negative. +- `maxMigrationCooldown` must be non-negative. +- If both are non-zero, `maxMigrationCooldown ≥ migrationCooldown` (a cap + below the floor is a misconfiguration). +- `migrationHistoryWindow` must be non-negative.
+ +### Example configurations + +**Conservative — minimal interference, short memory:** +```yaml +migrationCooldown: 5m +maxMigrationCooldown: 1h +migrationHistoryWindow: 6h +``` +Suitable for clusters where workloads are expected to migrate frequently for +legitimate reasons (e.g. scheduled maintenance windows) and operators do not +want the backoff to accumulate. + +**Default — balanced protection:** +```yaml +# All defaults; these values are applied automatically when fields are omitted. +migrationCooldown: 15m +maxMigrationCooldown: 6h +migrationHistoryWindow: 24h +``` + +**Protective — strong churn resistance:** +```yaml +migrationCooldown: 30m +maxMigrationCooldown: 12h +migrationHistoryWindow: 48h +``` +Suitable for clusters with large memory-intensive VMs where each migration is +expensive and operators want the descheduler to back off aggressively after +repeated evictions. + +--- + +## 5. Profile Setup + +The plugin must be listed under **both** `filter` and `preEvictionFilter` in +the descheduler profile. Listing it under only one extension point silently +disables the other; there is no error. + +The `DefaultEvictor` is automatically injected by the descheduler and does not +need to appear in the `plugins` section. It does however need an explicit +`pluginConfig` entry if you want non-default behaviour. For KubeVirt workloads +`nodeFit: true` is strongly recommended: it makes the descheduler verify that a +suitable destination node exists before issuing an eviction, preventing a VM +from being evicted into a situation where the scheduler has nowhere valid to +place it. 
+ +```yaml +apiVersion: "descheduler/v1alpha2" +kind: "DeschedulerPolicy" +profiles: + - name: KubevirtRelieveAndMigrate + pluginConfig: + - name: KubevirtMigrationAware + args: + migrationCooldown: 15m + maxMigrationCooldown: 6h + migrationHistoryWindow: 24h + - name: DefaultEvictor + args: + nodeFit: true # only evict when a valid destination node exists + - name: LowNodeUtilization + args: + thresholds: + MetricResource: 10 + targetThresholds: + MetricResource: 10 + useDeviationThresholds: true + plugins: + filter: + enabled: + - KubevirtMigrationAware # hard-blocks eviction during active migration + preEvictionFilter: + enabled: + - KubevirtMigrationAware # soft-defers eviction during cooldown + balance: + enabled: + - LowNodeUtilization +``` + +The RBAC for the descheduler's service account must include `list` and `watch` +on `virtualmachineinstances` in all namespaces: + +```yaml +- apiGroups: ["kubevirt.io"] + resources: ["virtualmachineinstances"] + verbs: ["list", "watch"] +``` + +--- + +## 6. Observability + +### Metrics + +The plugin registers two Prometheus metrics. + +**`descheduler_kubevirt_eviction_blocks_total`** — counter + +Incremented each time the plugin prevents an eviction, labelled by `reason`, +`node`, and `namespace`. + +| Label | Values | Description | +|---|---|---| +| `reason` | `migration_in_progress` | Blocked by `Filter`: VMI has an active migration. | +| `reason` | `cooldown` | Blocked by `PreEvictionFilter`: VMI is within its cooldown window. | +| `node` | node name | The node the `virt-launcher` pod was scheduled on. | +| `namespace` | namespace name | The namespace of the `virt-launcher` pod and VMI. | + +**`descheduler_kubevirt_effective_cooldown_seconds`** — histogram + +Recorded each time a cooldown block is applied, capturing the effective +cooldown duration in seconds. 
Bucket boundaries correspond to the exponential +backoff steps under default configuration: +`900` (15 m), `1800` (30 m), `3600` (1 h), `7200` (2 h), `14400` (4 h), +`21600` (6 h). + +If most observations land in the `900` bucket, the plugin is applying only the +base cooldown — backoff is not engaging significantly. If observations shift +toward `21600`, many VMs are hitting the cap, indicating sustained churn that +warrants operator attention. + +### Useful PromQL queries + +**Is the plugin actively blocking evictions on any node?** +```promql +rate(descheduler_kubevirt_eviction_blocks_total[10m]) > 0 +``` + +**Which nodes are seeing the most cooldown blocks over the last hour?** +```promql +topk(10, + increase(descheduler_kubevirt_eviction_blocks_total{reason="cooldown"}[1h]) +) +``` + +**Are VMs hitting the cap (6 h bucket) — sign of a churn loop?** +```promql +1 - increase(descheduler_kubevirt_effective_cooldown_seconds_bucket{le="14400"}[1h]) +/ ignoring(le) +increase(descheduler_kubevirt_effective_cooldown_seconds_count[1h]) +``` +Histogram buckets are cumulative, so this is the fraction of blocks whose effective cooldown exceeded 4 hours (the 6 h bucket). +A ratio close to 0 means most blocks come from the base cooldown or early backoff steps — +normal expected behaviour. A ratio close to 1.0 means most blocked VMs are at the cap. + +**Suggested alert — sustained blocking on a single node:** +```promql +rate(descheduler_kubevirt_eviction_blocks_total{reason="cooldown"}[30m]) > 0.1 +``` +`rate()` is per-second, so this fires if a node is seeing more than one cooldown block every ~10 seconds +over a 30-minute window, which may indicate that all VMs on that node are in +cooldown and the descheduler cannot rebalance it. Use `> 1/600` to alert at one block every ~10 minutes instead. + +--- + +## 7. Scheduler–Descheduler Decoupling + +The descheduler and the scheduler are independent, stateless components with +no shared state. When the descheduler evicts a `virt-launcher` pod from an +overloaded node it is making a bet: it hopes the scheduler will place the new +pod somewhere better, but it has no control over — and no visibility into — +where that placement will actually land.
+ +The **soft-tainter** (a separate operator component) tries to close this +information gap by applying `PreferNoSchedule` taints to overloaded nodes, +nudging the scheduler away from them. This works well for most workloads, but +a VM that carries specific scheduling constraints — `nodeSelector`, `nodeAffinity`, +pod affinity rules, or a narrow set of tolerated taints — can bypass +`PreferNoSchedule` entirely and land back on a suboptimal node regardless. + +When that happens this plugin's cooldown mechanism provides a backstop: the VM +that just migrated onto the wrong node will not be immediately re-evicted in a +tight loop. The exponential backoff described in §3.2 makes each successive +eviction of the same VM progressively harder, giving the cluster time to +stabilise or for an operator to intervene. + +However, the cooldown only protects the specific VM that landed badly. The +descheduler still sees an overloaded node and will continue to evict *other* +eligible VMs from it, which may or may not improve the situation depending on +their scheduling constraints. The fundamental fix for mis-placed VMs is +correct scheduling configuration, not rate-limiting. + +--- + +## 8. Known Limitations + +**Migration history (Tier B) is in-memory and lost on restart.** +The 24-hour migration history that drives exponential backoff (§3, Tier B) is +stored in the plugin's process memory. If the descheduler pod is restarted — +rolling update, OOM kill, node eviction — the history is reset. VMs that had +accumulated backoff appear clean again, and the descheduler may trigger a burst +of migrations immediately after restart. + +The VMI-persisted state (§3, Tier A) — the last migration's start and end +timestamps on the VMI object — is unaffected by a descheduler restart and +continues to enforce the base adaptive cooldown. 
The window of vulnerability +is therefore bounded: the per-VM base cooldown (layer 1) remains intact; only +the churn-history multiplier (layer 2) is lost until the in-memory history +rebuilds. + +**All migrations are counted, not only descheduler-caused ones.** +The informer handler fires on every `endTimestamp` transition, including +migrations triggered by node drain, KubeVirt's own resource management, or +manual operator actions. A VM that was legitimately drained for hardware +maintenance will enter the backoff history alongside descheduler-driven +evictions. This is conservative: the plugin may protect a VM that does not +strictly need protection, but it will never fail to protect one that does. + +**Cooldown protects individual VMs but does not reduce total migration volume.** +If 50 VMs on a cluster are in cooldown, the descheduler selects the other 50 +as candidates. The total number of migrations on the cluster does not decrease; +the benefit is that no single VM is churned repeatedly. For clusters where the +outlier threshold keeps finding outliers under sustained load, raising the +outlier margin is a more effective lever than this plugin alone. + +**Mixed VM sizes can produce structural non-convergence.** +In a cluster with heterogeneous VM sizes (e.g. 144 × 1 Gi VMs and 1 × 64 Gi +VM), the descheduler may never reach a stable state: migrating the large VM +sharply changes utilisation on both source and destination, which can flip +which nodes satisfy the outlier predicate on the next cycle, causing +back-and-forth movement. This plugin's adaptive cooldown gives the large VM +more protection (its migration takes longer, so its layer-1 cooldown is +larger), but the surrounding small VMs are still subject to ongoing eviction. +True convergence in this scenario requires VM-size-aware eviction selection, +which is outside the scope of this plugin. + +--- + +## 9. 
Fail-Open Contract + +Every code path that cannot retrieve or parse VMI state allows the eviction to +proceed rather than blocking it. Specifically: + +- Pod has no `kubevirt.io/domain` annotation → not a `virt-launcher` pod → pass through. +- VMI not found in informer cache (cache miss, different namespace, VMI deleted) → pass through. +- VMI object is not of type `*unstructured.Unstructured` → pass through. +- `startTimestamp` or `endTimestamp` is absent or not valid RFC 3339 → treat as no migration record → pass through. + +The plugin never hard-fails in a way that would prevent evictions of unrelated +workloads. If KubeVirt is uninstalled after the plugin starts, the informer +cache goes stale but continues to serve the last known state; cache misses for +new VMIs fail open. + +The only hard failure is at startup: if the VMI informer cache does not sync +within 30 seconds, the plugin returns an error and the descheduler does not +start. This is intentional — operating with a permanently empty or stale cache +would silently remove all VM protections. diff --git a/pkg/framework/plugins/kubevirtmigrationaware/defaults.go b/pkg/framework/plugins/kubevirtmigrationaware/defaults.go new file mode 100644 index 0000000000..bed5900765 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/defaults.go @@ -0,0 +1,47 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubevirtmigrationaware + +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +const ( + defaultMigrationCooldown = 15 * time.Minute + defaultMaxMigrationCooldown = 6 * time.Hour + defaultMigrationHistoryWindow = 24 * time.Hour +) + +func addDefaultingFuncs(scheme *runtime.Scheme) error { + return RegisterDefaults(scheme) +} + +func SetDefaults_KubevirtMigrationAwareArgs(obj runtime.Object) { + args := obj.(*KubevirtMigrationAwareArgs) + if args.MigrationCooldown.Duration == 0 { + args.MigrationCooldown = metav1.Duration{Duration: defaultMigrationCooldown} + } + if args.MaxMigrationCooldown.Duration == 0 { + args.MaxMigrationCooldown = metav1.Duration{Duration: defaultMaxMigrationCooldown} + } + if args.MigrationHistoryWindow.Duration == 0 { + args.MigrationHistoryWindow = metav1.Duration{Duration: defaultMigrationHistoryWindow} + } +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/defaults_test.go b/pkg/framework/plugins/kubevirtmigrationaware/defaults_test.go new file mode 100644 index 0000000000..6e03073fea --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/defaults_test.go @@ -0,0 +1,55 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubevirtmigrationaware + +import ( + "testing" + "time" +) + +func TestSetDefaults(t *testing.T) { + t.Run("zero value gets all defaults", func(t *testing.T) { + args := &KubevirtMigrationAwareArgs{} + SetDefaults_KubevirtMigrationAwareArgs(args) + if args.MigrationCooldown.Duration != defaultMigrationCooldown { + t.Errorf("MigrationCooldown = %v, want %v", args.MigrationCooldown.Duration, defaultMigrationCooldown) + } + if args.MaxMigrationCooldown.Duration != defaultMaxMigrationCooldown { + t.Errorf("MaxMigrationCooldown = %v, want %v", args.MaxMigrationCooldown.Duration, defaultMaxMigrationCooldown) + } + if args.MigrationHistoryWindow.Duration != defaultMigrationHistoryWindow { + t.Errorf("MigrationHistoryWindow = %v, want %v", args.MigrationHistoryWindow.Duration, defaultMigrationHistoryWindow) + } + }) + + t.Run("explicit values are preserved", func(t *testing.T) { + args := &KubevirtMigrationAwareArgs{} + args.MigrationCooldown.Duration = 10 * time.Minute + args.MaxMigrationCooldown.Duration = 4 * time.Hour + args.MigrationHistoryWindow.Duration = 12 * time.Hour + SetDefaults_KubevirtMigrationAwareArgs(args) + if args.MigrationCooldown.Duration != 10*time.Minute { + t.Errorf("MigrationCooldown = %v, want 10m", args.MigrationCooldown.Duration) + } + if args.MaxMigrationCooldown.Duration != 4*time.Hour { + t.Errorf("MaxMigrationCooldown = %v, want 4h", args.MaxMigrationCooldown.Duration) + } + if args.MigrationHistoryWindow.Duration != 12*time.Hour { + t.Errorf("MigrationHistoryWindow = %v, want 12h", args.MigrationHistoryWindow.Duration) + } + }) +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/doc.go b/pkg/framework/plugins/kubevirtmigrationaware/doc.go new file mode 100644 index 0000000000..d94f6d2c07 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2026 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// +k8s:defaulter-gen=TypeMeta + +package kubevirtmigrationaware diff --git a/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware.go b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware.go new file mode 100644 index 0000000000..c0ac3a3273 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware.go @@ -0,0 +1,408 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package kubevirtmigrationaware provides an EvictorPlugin that prevents the +// descheduler from evicting virt-launcher pods while a VM live-migration is in +// progress (Filter) and suppresses re-eviction during a configurable cooldown +// period after the migration completes (PreEvictionFilter). 
+//
+// Both extension points operate on the per-VMI migrationState recorded in the
+// VMI status, which is kept in a local informer cache to avoid API-server load
+// in the hot eviction path.
+package kubevirtmigrationaware
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"sync"
+	"time"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/dynamic"
+	"k8s.io/client-go/dynamic/dynamicinformer"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/cache"
+	k8smetrics "k8s.io/component-base/metrics"
+	"k8s.io/component-base/metrics/legacyregistry"
+	"k8s.io/klog/v2"
+
+	frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types"
+)
+
+const (
+	PluginName = "KubevirtMigrationAware"
+
+	// virt-launcher pods carry this annotation with the VMI name as value.
+	vmiAnnotationKey = "kubevirt.io/domain"
+
+	// VMI GVR in the kubevirt.io API group.
+	vmiGroup    = "kubevirt.io"
+	vmiVersion  = "v1"
+	vmiResource = "virtualmachineinstances"
+
+	// Timeout for the initial informer cache sync at plugin startup.
+	cacheWarmupTimeout = 30 * time.Second
+
+	reasonMigrationInProgress = "migration_in_progress"
+	reasonCooldown            = "cooldown"
+)
+
+var (
+	vmiGVR = schema.GroupVersionResource{
+		Group:    vmiGroup,
+		Version:  vmiVersion,
+		Resource: vmiResource,
+	}
+
+	// evictionBlocksTotal counts how many times the plugin prevented an eviction,
+	// labelled by reason, node, and namespace. Use this to identify which nodes
+	// or tenant namespaces are experiencing repeated eviction gating.
+	evictionBlocksTotal = k8smetrics.NewCounterVec(
+		&k8smetrics.CounterOpts{
+			Subsystem:      "descheduler",
+			Name:           "kubevirt_eviction_blocks_total",
+			Help:           "Number of virt-launcher pod evictions blocked by KubevirtMigrationAware, by reason, node, and namespace.",
+			StabilityLevel: k8smetrics.ALPHA,
+		},
+		[]string{"reason", "node", "namespace"},
+	)
+
+	// effectiveCooldownSeconds is a histogram of the adaptive cooldown durations
+	// applied when deferring evictions. Bucket boundaries match the exponential
+	// backoff steps with default configuration (15m base, 6h cap), so the
+	// distribution directly shows whether VMs are hitting the base cooldown or
+	// being pushed toward the cap by repeated migrations.
+	effectiveCooldownSeconds = k8smetrics.NewHistogram(
+		&k8smetrics.HistogramOpts{
+			Subsystem:      "descheduler",
+			Name:           "kubevirt_effective_cooldown_seconds",
+			Help:           "Distribution of effective cooldown durations applied when deferring virt-launcher pod evictions, in seconds.",
+			StabilityLevel: k8smetrics.ALPHA,
+			Buckets:        []float64{900, 1800, 3600, 7200, 14400, 21600}, // 15m 30m 1h 2h 4h 6h
+		},
+	)
+
+	registerMetricsOnce sync.Once
+)
+
+// migrationHistory tracks per-VMI migration completion timestamps within a
+// sliding window so that the plugin can apply exponential backoff to VMs that
+// are migrated repeatedly.
+//
+// All access goes through mu. Each per-VMI slice is kept in ascending time
+// order by record, because countAndPrune locates the window boundary with a
+// binary search.
+type migrationHistory struct {
+	mu          sync.Mutex
+	completions map[types.UID][]time.Time // sorted ascending; pruned lazily
+}
+
+// newMigrationHistory returns an empty, ready-to-use history.
+func newMigrationHistory() *migrationHistory {
+	return &migrationHistory{completions: make(map[types.UID][]time.Time)}
+}
+
+// record stores a migration completion time t for the VMI identified by uid.
+//
+// The per-VMI slice is kept sorted ascending. Completions normally arrive in
+// chronological order and are simply appended; a timestamp older than the
+// newest stored one (for example a parsed endTimestamp following an entry that
+// fell back to time.Now(), or clock skew between nodes) is inserted at its
+// sorted position instead, so the binary search in countAndPrune stays valid.
+func (h *migrationHistory) record(uid types.UID, t time.Time) {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	ts := h.completions[uid]
+
+	// Fast path: t is not older than the newest entry, so appending keeps order.
+	if n := len(ts); n == 0 || !t.Before(ts[n-1]) {
+		h.completions[uid] = append(ts, t)
+		return
+	}
+
+	// Slow path: insert t after every entry that is not later than it.
+	i := sort.Search(len(ts), func(j int) bool { return ts[j].After(t) })
+	ts = append(ts, time.Time{})
+	copy(ts[i+1:], ts[i:])
+	ts[i] = t
+	h.completions[uid] = ts
+}
+
+// countAndPrune returns the number of migrations recorded for uid within
+// window, pruning expired entries in the process.
+func (h *migrationHistory) countAndPrune(uid types.UID, window time.Duration) int {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	stamps := h.completions[uid]
+	if len(stamps) == 0 {
+		return 0
+	}
+
+	// Binary-search for the first completion newer than the start of the
+	// window; this relies on the slice being in ascending order.
+	oldest := time.Now().Add(-window)
+	first := sort.Search(len(stamps), func(n int) bool { return stamps[n].After(oldest) })
+
+	switch {
+	case first == len(stamps):
+		// Every entry has aged out: drop the key entirely.
+		delete(h.completions, uid)
+		return 0
+	case first > 0:
+		// Discard the expired prefix and keep the rest.
+		stamps = stamps[first:]
+		h.completions[uid] = stamps
+	}
+	return len(stamps)
+}
+
+// onVMIUpdate is the UpdateFunc wired into the VMI informer. Whenever
+// status.migrationState.endTimestamp changes to a new, non-empty value it
+// records one completion for the VMI's UID. An endTimestamp that does not
+// parse as RFC 3339 is recorded with the current time instead. Objects that
+// are not *unstructured.Unstructured are ignored.
+func (h *migrationHistory) onVMIUpdate(oldObj, newObj interface{}) {
+	before, isUnstructured := oldObj.(*unstructured.Unstructured)
+	if !isUnstructured {
+		return
+	}
+	after, isUnstructured := newObj.(*unstructured.Unstructured)
+	if !isUnstructured {
+		return
+	}
+
+	// endOf yields the raw endTimestamp, or "" when it is absent or not a string.
+	endOf := func(u *unstructured.Unstructured) string {
+		raw, _, _ := unstructured.NestedString(u.Object, "status", "migrationState", "endTimestamp")
+		return raw
+	}
+
+	previous, current := endOf(before), endOf(after)
+	if current == "" || current == previous {
+		return
+	}
+
+	completedAt, err := time.Parse(time.RFC3339, current)
+	if err != nil {
+		completedAt = time.Now()
+	}
+	h.record(after.GetUID(), completedAt)
+}
+
+// KubevirtMigrationAware is an EvictorPlugin.
+//
+//   - Filter: hard-blocks eviction of virt-launcher pods whose VMI is
+//     currently mid-migration (startTimestamp set, endTimestamp absent).
+//
+//   - PreEvictionFilter: soft-blocks eviction of virt-launcher pods whose VMI
+//     completed a migration within the configured MigrationCooldown window,
+//     allowing the eviction loop to skip and try other candidates instead.
+type KubevirtMigrationAware struct {
+	logger    klog.Logger
+	handle    frameworktypes.Handle
+	args      *KubevirtMigrationAwareArgs
+	vmiLister cache.GenericLister
+	history   *migrationHistory
+}
+
+var _ frameworktypes.EvictorPlugin = &KubevirtMigrationAware{}
+
+// New builds the plugin from its arguments.
+// It creates a dedicated dynamic client and VMI informer so that Filter and
+// PreEvictionFilter can read VMI state from a local cache instead of hitting
+// the API server on every eviction decision.
+//
+// The same informer feeds the migration history: updates record migration
+// completions and deletions discard the history of the deleted VMI, so the
+// history does not grow with every VMI that ever existed.
+//
+// New returns an error when args has the wrong type, when the in-cluster
+// config or dynamic client cannot be built, when the event handler cannot be
+// registered, or when the VMI cache does not sync within cacheWarmupTimeout.
+func New(ctx context.Context, args runtime.Object, handle frameworktypes.Handle) (frameworktypes.Plugin, error) {
+	kmaArgs, ok := args.(*KubevirtMigrationAwareArgs)
+	if !ok {
+		return nil, fmt.Errorf("want args to be of type KubevirtMigrationAwareArgs, got %T", args)
+	}
+
+	cfg, err := rest.InClusterConfig()
+	if err != nil {
+		return nil, fmt.Errorf("failed to build in-cluster REST config: %w", err)
+	}
+
+	dynClient, err := dynamic.NewForConfig(cfg)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create dynamic client: %w", err)
+	}
+
+	// Create an all-namespaces informer factory with no re-sync (0 = disabled).
+	factory := dynamicinformer.NewDynamicSharedInformerFactory(dynClient, 0)
+	vmiGenericInformer := factory.ForResource(vmiGVR)
+
+	history := newMigrationHistory()
+	if _, err = vmiGenericInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		UpdateFunc: history.onVMIUpdate,
+		// Without this, entries of deleted VMIs would never be pruned:
+		// countAndPrune is only reached for VMIs still present in the cache.
+		DeleteFunc: history.onVMIDelete,
+	}); err != nil {
+		return nil, fmt.Errorf("failed to register VMI event handler: %w", err)
+	}
+
+	factory.Start(ctx.Done())
+
+	syncCtx, cancel := context.WithTimeout(ctx, cacheWarmupTimeout)
+	defer cancel()
+	if !cache.WaitForCacheSync(syncCtx.Done(), vmiGenericInformer.Informer().HasSynced) {
+		return nil, fmt.Errorf("timed out waiting for VMI informer cache to sync (is KubeVirt installed?)")
+	}
+
+	return newPlugin(ctx, kmaArgs, handle, vmiGenericInformer.Lister(), history)
+}
+
+// onVMIDelete is the informer DeleteFunc handler. It drops every completion
+// recorded for the deleted VMI's UID. The object may arrive wrapped in a
+// cache.DeletedFinalStateUnknown tombstone when the delete event itself was
+// missed; anything that is not an *unstructured.Unstructured is ignored.
+func (h *migrationHistory) onVMIDelete(obj interface{}) {
+	if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok {
+		obj = tombstone.Obj
+	}
+	u, ok := obj.(*unstructured.Unstructured)
+	if !ok {
+		return
+	}
+
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	delete(h.completions, u.GetUID())
+}
+
+// newPlugin is the internal constructor used by both New (production) and tests.
+// Tests call this directly with a pre-populated fake lister, bypassing the
+// dynamic client and in-cluster config entirely.
+func newPlugin(ctx context.Context, args *KubevirtMigrationAwareArgs, handle frameworktypes.Handle, lister cache.GenericLister, history *migrationHistory) (frameworktypes.Plugin, error) {
+	logger := klog.FromContext(ctx).WithValues("plugin", PluginName)
+
+	// The metrics are package-level, so register them only once no matter how
+	// many plugin instances are built.
+	registerMetricsOnce.Do(func() {
+		legacyregistry.MustRegister(evictionBlocksTotal, effectiveCooldownSeconds)
+	})
+
+	logger.V(2).Info("VMI lister ready",
+		"cooldown", args.MigrationCooldown.Duration,
+		"maxCooldown", args.MaxMigrationCooldown.Duration,
+		"historyWindow", args.MigrationHistoryWindow.Duration)
+
+	return &KubevirtMigrationAware{
+		logger:    logger,
+		handle:    handle,
+		args:      args,
+		vmiLister: lister,
+		history:   history,
+	}, nil
+}
+
+// Name returns the plugin name.
+func (k *KubevirtMigrationAware) Name() string {
+	return PluginName
+}
+
+// Filter returns false (block eviction) for virt-launcher pods whose
+// corresponding VMI is currently mid-migration. Non-virt-launcher pods and
+// any VMI lookup failure are passed through (fail open). Every block is
+// counted in evictionBlocksTotal with reason "migration_in_progress".
+func (k *KubevirtMigrationAware) Filter(pod *v1.Pod) bool {
+	uObj, ok := k.getVMI(pod)
+	if !ok {
+		return true
+	}
+
+	if migrationInProgress(uObj) {
+		k.logger.V(3).Info("VMI migration in progress, blocking eviction",
+			"pod", klog.KObj(pod), "vmi", pod.Annotations[vmiAnnotationKey], "node", pod.Spec.NodeName)
+		evictionBlocksTotal.WithLabelValues(reasonMigrationInProgress, pod.Spec.NodeName, pod.Namespace).Inc()
+		return false
+	}
+
+	return true
+}
+
+// PreEvictionFilter returns false (defer eviction) for virt-launcher pods
+// whose corresponding VMI completed a migration within the effective cooldown
+// window. Pods without a VMI in the cache, and VMIs without a parseable
+// endTimestamp, are passed through (fail open).
+//
+// The effective cooldown is computed in three steps:
+//  1. Base: max(MigrationCooldown, migration duration) — heavier VMs get longer
+//     protection automatically. The duration is only considered when
+//     startTimestamp is present and parseable.
+//  2. Exponential backoff: the base is doubled for each additional migration
+//     recorded within the configured MigrationHistoryWindow, so repeatedly
+//     migrated VMs are progressively protected against churn.
+//  3. Cap: if MaxMigrationCooldown is non-zero, the result is capped there.
+//
+// Every deferral is counted in evictionBlocksTotal with reason "cooldown" and
+// the applied cooldown is observed in effectiveCooldownSeconds.
+func (k *KubevirtMigrationAware) PreEvictionFilter(pod *v1.Pod) bool {
+	uObj, ok := k.getVMI(pod)
+	if !ok {
+		return true
+	}
+
+	endTime, ok := migrationEndTime(uObj)
+	if !ok {
+		return true
+	}
+
+	// Zero means "no upper bound".
+	maxCooldown := k.args.MaxMigrationCooldown.Duration
+
+	// Step 1: base = max(configured, migration duration).
+	effectiveCooldown := k.args.MigrationCooldown.Duration
+	if startTime, hasStart := migrationStartTime(uObj); hasStart {
+		if d := endTime.Sub(*startTime); d > effectiveCooldown {
+			effectiveCooldown = d
+		}
+	}
+
+	// Step 2: double the cooldown for each migration beyond the first recorded
+	// in the history window.
+	count := k.history.countAndPrune(uObj.GetUID(), k.args.MigrationHistoryWindow.Duration)
+	for i := 1; i < count; i++ {
+		if maxCooldown > 0 && effectiveCooldown >= maxCooldown {
+			break // already at or above the cap applied in step 3
+		}
+		next := effectiveCooldown * 2
+		if next/2 != effectiveCooldown { // int64 overflow guard
+			break
+		}
+		effectiveCooldown = next
+	}
+
+	// Step 3: apply the optional upper bound.
+	if maxCooldown > 0 && effectiveCooldown > maxCooldown {
+		effectiveCooldown = maxCooldown
+	}
+
+	elapsed := time.Since(*endTime)
+	if elapsed < effectiveCooldown {
+		remaining := effectiveCooldown - elapsed
+		k.logger.V(3).Info("VMI in migration cooldown, deferring eviction",
+			"pod", klog.KObj(pod), "vmi", pod.Annotations[vmiAnnotationKey], "node", pod.Spec.NodeName,
+			"migrationCount", count,
+			"elapsed", elapsed.Round(time.Second),
+			"effectiveCooldown", effectiveCooldown.Round(time.Second),
+			"remaining", remaining.Round(time.Second))
+		evictionBlocksTotal.WithLabelValues(reasonCooldown, pod.Spec.NodeName, pod.Namespace).Inc()
+		effectiveCooldownSeconds.Observe(effectiveCooldown.Seconds())
+		return false
+	}
+
+	return true
+}
+
+// getVMI looks up the VMI for a virt-launcher pod from the informer cache.
+// The VMI name is taken from the pod's kubevirt.io/domain annotation and the
+// lookup is scoped to the pod's namespace.
+// Returns (nil, false) for non-virt-launcher pods and on any lookup error
+// (fail open: the pod is not blocked from eviction).
+func (k *KubevirtMigrationAware) getVMI(pod *v1.Pod) (*unstructured.Unstructured, bool) {
+	vmiName, ok := pod.Annotations[vmiAnnotationKey]
+	if !ok || vmiName == "" {
+		// A missing or empty annotation cannot name a VMI; skip the cache lookup.
+		return nil, false
+	}
+
+	rObj, err := k.vmiLister.ByNamespace(pod.Namespace).Get(vmiName)
+	if err != nil {
+		k.logger.V(4).Info("VMI not found in cache, allowing eviction",
+			"pod", klog.KObj(pod), "vmi", vmiName, "err", err)
+		return nil, false
+	}
+
+	uObj, ok := rObj.(*unstructured.Unstructured)
+	if !ok {
+		k.logger.V(4).Info("Unexpected VMI object type, allowing eviction",
+			"pod", klog.KObj(pod), "vmi", vmiName, "type", fmt.Sprintf("%T", rObj))
+		return nil, false
+	}
+
+	return uObj, true
+}
+
+// migrationInProgress returns true when the VMI has a migration that has
+// started but not yet finished (startTimestamp present, endTimestamp absent).
+func migrationInProgress(uObj *unstructured.Unstructured) bool {
+	started := migrationStateString(uObj, "startTimestamp") != ""
+	ended := migrationStateString(uObj, "endTimestamp") != ""
+	return started && !ended
+}
+
+// migrationStartTime reports when the VMI's recorded migration started.
+// It yields (nil, false) if startTimestamp is missing, empty, or not RFC 3339.
+func migrationStartTime(uObj *unstructured.Unstructured) (*time.Time, bool) {
+	return migrationTimestamp(uObj, "startTimestamp")
+}
+
+// migrationEndTime reports when the VMI's recorded migration finished.
+// It yields (nil, false) if endTimestamp is missing, empty, or not RFC 3339,
+// i.e. when there is no usable completed-migration record.
+func migrationEndTime(uObj *unstructured.Unstructured) (*time.Time, bool) {
+	return migrationTimestamp(uObj, "endTimestamp")
+}
+
+// migrationStateString reads status.migrationState.<field> as a string.
+// An absent field and a non-string value both come back as "".
+func migrationStateString(uObj *unstructured.Unstructured, field string) string {
+	value, _, _ := unstructured.NestedString(uObj.Object, "status", "migrationState", field)
+	return value
+}
+
+// migrationTimestamp parses status.migrationState.<field> as an RFC 3339
+// time, returning (nil, false) when the field is empty or malformed.
+func migrationTimestamp(uObj *unstructured.Unstructured, field string) (*time.Time, bool) {
+	raw := migrationStateString(uObj, field)
+	if raw == "" {
+		return nil, false
+	}
+	parsed, err := time.Parse(time.RFC3339, raw)
+	if err != nil {
+		return nil, false
+	}
+	return &parsed, true
+}
diff --git a/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware_test.go b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware_test.go
new file mode 100644
index 0000000000..f07b1f6f36
--- /dev/null
+++ b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware_test.go
@@ -0,0 +1,592 @@
+/*
+Copyright 2026 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubevirtmigrationaware + +import ( + "context" + "testing" + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/cache" +) + +// makeVMI builds an unstructured VMI object with optional migrationState. +// migrationState may be nil (no migration ever ran), a map with only +// "startTimestamp" (in progress), or a map with both timestamps (completed). +// No kubevirt imports are needed: the object is just a plain nested map. +func makeVMI(namespace, name string, migrationState map[string]interface{}) *unstructured.Unstructured { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "kubevirt.io/v1", + "kind": "VirtualMachineInstance", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + }, + } + if migrationState != nil { + obj.Object["status"] = map[string]interface{}{ + "migrationState": migrationState, + } + } + return obj +} + +// inProgressState returns a migrationState map representing an ongoing migration. +func inProgressState(start time.Time) map[string]interface{} { + return map[string]interface{}{ + "startTimestamp": start.UTC().Format(time.RFC3339), + } +} + +// completedState returns a migrationState map representing a finished migration. 
+func completedState(start, end time.Time) map[string]interface{} { + return map[string]interface{}{ + "startTimestamp": start.UTC().Format(time.RFC3339), + "endTimestamp": end.UTC().Format(time.RFC3339), + } +} + +// makeVirtLauncherPod returns a pod that carries the kubevirt.io/domain annotation +// linking it to a VMI, as a real virt-launcher pod would. +func makeVirtLauncherPod(namespace, name, nodeName, vmiName string) *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Annotations: map[string]string{ + vmiAnnotationKey: vmiName, + }, + }, + Spec: v1.PodSpec{NodeName: nodeName}, + } +} + +// makePlainPod returns a pod with no kubevirt annotation (e.g. a regular workload). +func makePlainPod(namespace, name, nodeName string) *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, + Spec: v1.PodSpec{NodeName: nodeName}, + } +} + +// makeVMILister builds a cache.GenericLister pre-populated with the given VMIs. +// It mirrors exactly what the production dynamic informer would serve, without +// any dynamic client or network calls. +func makeVMILister(vmis ...*unstructured.Unstructured) cache.GenericLister { + indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{ + cache.NamespaceIndex: cache.MetaNamespaceIndexFunc, + }) + for _, vmi := range vmis { + _ = indexer.Add(vmi) + } + return cache.NewGenericLister(indexer, vmiGVR.GroupResource()) +} + +// newTestPlugin is a convenience wrapper that calls the internal constructor +// with a fake lister. Pass maxCooldown=0 to disable the adaptive cap. 
+func newTestPlugin(t *testing.T, cooldown, maxCooldown time.Duration, vmis ...*unstructured.Unstructured) *KubevirtMigrationAware { + t.Helper() + args := &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: cooldown}, + MaxMigrationCooldown: metav1.Duration{Duration: maxCooldown}, + MigrationHistoryWindow: metav1.Duration{Duration: defaultMigrationHistoryWindow}, + } + pg, err := newPlugin(context.Background(), args, nil, makeVMILister(vmis...), newMigrationHistory()) + if err != nil { + t.Fatalf("newPlugin: %v", err) + } + return pg.(*KubevirtMigrationAware) +} + +// ── Filter ──────────────────────────────────────────────────────────────────── + +func TestFilter(t *testing.T) { + const ns = "default" + now := time.Now() + + cases := []struct { + description string + vmis []*unstructured.Unstructured + pod *v1.Pod + wantAllow bool + }{ + { + description: "non-virt-launcher pod (no annotation) is always allowed", + pod: makePlainPod(ns, "plain-pod", "node-1"), + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI is absent from cache is allowed (fail open)", + pod: makeVirtLauncherPod(ns, "virt-launcher-a", "node-1", "vm-a"), + // no VMIs added to the lister + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI has no migration history is allowed", + vmis: []*unstructured.Unstructured{makeVMI(ns, "vm-b", nil)}, + pod: makeVirtLauncherPod(ns, "virt-launcher-b", "node-1", "vm-b"), + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI has a completed migration is allowed", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-c", completedState(now.Add(-10*time.Minute), now.Add(-5*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-c", "node-2", "vm-c"), + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI migration is in progress is blocked", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-d", inProgressState(now.Add(-2*time.Minute))), + }, + 
pod: makeVirtLauncherPod(ns, "virt-launcher-d", "node-1", "vm-d"), + wantAllow: false, + }, + { + description: "pod in different namespace from VMI cache entry is allowed (cache miss)", + vmis: []*unstructured.Unstructured{ + makeVMI("other-ns", "vm-e", inProgressState(now.Add(-1*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-e", "node-1", "vm-e"), + wantAllow: true, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + plugin := newTestPlugin(t, 5*time.Minute, 0, tc.vmis...) + got := plugin.Filter(tc.pod) + if got != tc.wantAllow { + t.Errorf("Filter() = %v, want %v", got, tc.wantAllow) + } + }) + } +} + +// ── PreEvictionFilter ───────────────────────────────────────────────────────── + +func TestPreEvictionFilter(t *testing.T) { + const ( + ns = "default" + cooldown = 5 * time.Minute + ) + now := time.Now() + + cases := []struct { + description string + vmis []*unstructured.Unstructured + pod *v1.Pod + wantAllow bool + }{ + { + description: "non-virt-launcher pod is always allowed", + pod: makePlainPod(ns, "plain-pod", "node-1"), + wantAllow: true, + }, + { + description: "VMI absent from cache is allowed (fail open)", + pod: makeVirtLauncherPod(ns, "virt-launcher-a", "node-1", "vm-a"), + wantAllow: true, + }, + { + description: "VMI with no migration history is allowed", + vmis: []*unstructured.Unstructured{makeVMI(ns, "vm-b", nil)}, + pod: makeVirtLauncherPod(ns, "virt-launcher-b", "node-1", "vm-b"), + wantAllow: true, + }, + { + description: "VMI whose migration ended just within the cooldown window is deferred", + vmis: []*unstructured.Unstructured{ + // ended 1 minute ago; cooldown is 5 minutes → still blocked + makeVMI(ns, "vm-c", completedState(now.Add(-10*time.Minute), now.Add(-1*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-c", "node-2", "vm-c"), + wantAllow: false, + }, + { + description: "VMI whose migration ended exactly at the cooldown boundary is allowed", + vmis: 
[]*unstructured.Unstructured{ + // ended 5 minutes + 1 second ago → just past the cooldown + makeVMI(ns, "vm-d", completedState(now.Add(-10*time.Minute), now.Add(-(cooldown+time.Second)))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-d", "node-2", "vm-d"), + wantAllow: true, + }, + { + description: "VMI whose migration ended well before the cooldown window is allowed", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-e", completedState(now.Add(-30*time.Minute), now.Add(-20*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-e", "node-3", "vm-e"), + wantAllow: true, + }, + { + description: "VMI mid-migration has no endTimestamp so is allowed by PreEvictionFilter (Filter handles this)", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-f", inProgressState(now.Add(-2*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-f", "node-1", "vm-f"), + wantAllow: true, + }, + { + description: "malformed endTimestamp is treated as no timestamp (fail open)", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-g", map[string]interface{}{ + "startTimestamp": now.Add(-10 * time.Minute).UTC().Format(time.RFC3339), + "endTimestamp": "not-a-valid-timestamp", + }), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-g", "node-1", "vm-g"), + wantAllow: true, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + plugin := newTestPlugin(t, cooldown, 0, tc.vmis...) + got := plugin.PreEvictionFilter(tc.pod) + if got != tc.wantAllow { + t.Errorf("PreEvictionFilter() = %v, want %v", got, tc.wantAllow) + } + }) + } +} + +// ── Cooldown duration is respected ─────────────────────────────────────────── + +func TestPreEvictionFilterRespectsConfiguredCooldown(t *testing.T) { + const ns = "default" + now := time.Now() + endedAgo := 3 * time.Minute + + // A 1-minute migration ensures the configured cooldown always dominates + // (migration duration < any cooldown under test). 
+	vmi := makeVMI(ns, "vm-1", completedState(now.Add(-(endedAgo+time.Minute)), now.Add(-endedAgo)))
+	pod := makeVirtLauncherPod(ns, "virt-launcher-1", "node-1", "vm-1")
+
+	// With a 5-minute cooldown the VM (ended 3m ago) should be deferred.
+	t.Run("5m cooldown blocks VM ended 3m ago", func(t *testing.T) {
+		plugin := newTestPlugin(t, 5*time.Minute, 0, vmi)
+		if plugin.PreEvictionFilter(pod) {
+			t.Error("PreEvictionFilter() = true (allowed), want false (deferred)")
+		}
+	})
+
+	// With a 2-minute cooldown the same VM should be evictable (elapsed 3m > effective 2m).
+	t.Run("2m cooldown allows VM ended 3m ago", func(t *testing.T) {
+		plugin := newTestPlugin(t, 2*time.Minute, 0, vmi)
+		if !plugin.PreEvictionFilter(pod) {
+			t.Error("PreEvictionFilter() = false (deferred), want true (allowed)")
+		}
+	})
+
+	// With zero cooldown the adaptive cooldown equals the migration duration (1m);
+	// elapsed 3m > 1m so the VM is immediately evictable.
+	t.Run("zero cooldown allows VM ended 3m ago (1m migration)", func(t *testing.T) {
+		plugin := newTestPlugin(t, 0, 0, vmi)
+		if !plugin.PreEvictionFilter(pod) {
+			t.Error("PreEvictionFilter() = false (deferred), want true (allowed)")
+		}
+	})
+}
+
+// ── Adaptive per-VM cooldown ──────────────────────────────────────────────────
+
+// TestPreEvictionFilterAdaptiveCooldown checks step 1 (base = max(configured,
+// migration duration)) and step 3 (MaxMigrationCooldown cap) of the effective
+// cooldown. No history is recorded, so backoff never applies here.
+func TestPreEvictionFilterAdaptiveCooldown(t *testing.T) {
+	const (
+		ns         = "default"
+		configured = 15 * time.Minute
+	)
+	now := time.Now()
+
+	cases := []struct {
+		description string
+		migStart    time.Duration // relative to now
+		migEnd      time.Duration // relative to now
+		maxCooldown time.Duration // 0 = disabled
+		wantAllow   bool
+	}{
+		{
+			// Small VM: 2-minute migration — configured 15m dominates.
+			// Ended 18m ago → elapsed(18m) > effective(15m) → allowed.
+			description: "small VM: configured cooldown dominates, elapsed past it",
+			migStart:    -20 * time.Minute,
+			migEnd:      -18 * time.Minute, // duration = 2m; elapsed = 18m
+			wantAllow:   true,
+		},
+		{
+			// Small VM: same 2-minute migration, but ended only 10m ago.
+			// effective = max(15m, 2m) = 15m; elapsed(10m) < 15m → blocked.
+			description: "small VM: configured cooldown dominates, still within window",
+			migStart:    -12 * time.Minute,
+			migEnd:      -10 * time.Minute, // duration = 2m; elapsed = 10m
+			wantAllow:   false,
+		},
+		{
+			// Large VM: 30-minute migration — duration dominates over 15m.
+			// Ended 5m ago → elapsed(5m) < effective(30m) → blocked.
+			description: "large VM: migration duration dominates, still within window",
+			migStart:    -35 * time.Minute,
+			migEnd:      -5 * time.Minute, // duration = 30m
+			wantAllow:   false,
+		},
+		{
+			// Large VM: 30-minute migration, ended 31m ago.
+			// effective = 30m; elapsed(31m) > 30m → allowed.
+			description: "large VM: migration duration dominates, elapsed past it",
+			migStart:    -61 * time.Minute,
+			migEnd:      -31 * time.Minute, // duration = 30m
+			wantAllow:   true,
+		},
+		{
+			// Large VM with cap: 30-minute migration capped at 20m.
+			// effective = min(max(15m, 30m), 20m) = 20m; ended 5m ago → blocked.
+			description: "large VM: cap applied, still within capped window",
+			migStart:    -35 * time.Minute,
+			migEnd:      -5 * time.Minute, // duration = 30m; cap = 20m
+			maxCooldown: 20 * time.Minute,
+			wantAllow:   false,
+		},
+		{
+			// Large VM with cap: same 30-minute migration, but ended 21m ago.
+			// effective = 20m (capped); elapsed(21m) > 20m → allowed.
+			description: "large VM: cap applied, elapsed past capped window",
+			migStart:    -51 * time.Minute,
+			migEnd:      -21 * time.Minute, // duration = 30m; cap = 20m
+			maxCooldown: 20 * time.Minute,
+			wantAllow:   true,
+		},
+		{
+			// No startTimestamp: adaptive path skipped, falls back to configured cooldown.
+			// effective = 15m; ended 10m ago → blocked.
+			description: "missing startTimestamp: falls back to configured cooldown",
+			migStart:    0, // sentinel: will be omitted from migrationState
+			migEnd:      -10 * time.Minute,
+			wantAllow:   false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.description, func(t *testing.T) {
+			var state map[string]interface{}
+			if tc.migStart == 0 {
+				// Only endTimestamp, no startTimestamp.
+				state = map[string]interface{}{
+					"endTimestamp": now.Add(tc.migEnd).UTC().Format(time.RFC3339),
+				}
+			} else {
+				state = completedState(now.Add(tc.migStart), now.Add(tc.migEnd))
+			}
+			vmi := makeVMI(ns, "vm-adaptive", state)
+			pod := makeVirtLauncherPod(ns, "virt-launcher-adaptive", "node-1", "vm-adaptive")
+
+			plugin := newTestPlugin(t, configured, tc.maxCooldown, vmi)
+			got := plugin.PreEvictionFilter(pod)
+			if got != tc.wantAllow {
+				t.Errorf("PreEvictionFilter() = %v, want %v", got, tc.wantAllow)
+			}
+		})
+	}
+}
+
+// ── Exponential backoff from migration frequency ──────────────────────────────
+
+func TestPreEvictionFilterExponentialBackoff(t *testing.T) {
+	const (
+		ns       = "default"
+		cooldown = 15 * time.Minute
+		vmiUID   = types.UID("uid-vm-backoff")
+	)
+	now := time.Now()
+
+	// A 10-second migration that ended 5 minutes ago.
+	// Base effective cooldown = max(15m, 10s) = 15m. elapsed = 5m < 15m.
+	vmi := makeVMI(ns, "vm-backoff", completedState(
+		now.Add(-5*time.Minute-10*time.Second),
+		now.Add(-5*time.Minute),
+	))
+	vmi.SetUID(vmiUID)
+	pod := makeVirtLauncherPod(ns, "virt-launcher-backoff", "node-1", "vm-backoff")
+
+	cases := []struct {
+		description  string
+		historyCount int           // entries to pre-populate (spread over last few hours)
+		maxCooldown  time.Duration // 0 = disabled
+		wantAllow    bool
+	}{
+		{
+			// count=0: race — current migration not yet in history; base 15m applies.
+ description: "count=0 (race): base cooldown, blocked", + historyCount: 0, + wantAllow: false, // 15m * 2^0 = 15m; elapsed 5m < 15m + }, + { + // count=1: one entry (current migration recorded); 2^0 doublings = 15m. + description: "count=1: first migration, base cooldown, blocked", + historyCount: 1, + wantAllow: false, // 15m * 2^0 = 15m; elapsed 5m < 15m + }, + { + // count=2: one prior + current; 2^1 doublings → 30m. + description: "count=2: one prior migration doubles cooldown to 30m, blocked", + historyCount: 2, + wantAllow: false, // 15m * 2^1 = 30m; elapsed 5m < 30m + }, + { + // count=3: two prior; 2^2 doublings → 60m. + description: "count=3: two prior migrations, cooldown 60m, blocked", + historyCount: 3, + wantAllow: false, // 15m * 2^2 = 60m; elapsed 5m < 60m + }, + { + // count=2 with max=20m: min(30m, 20m) = 20m; elapsed 5m < 20m → blocked. + description: "count=2 with max cap at 20m: capped, still blocked", + historyCount: 2, + maxCooldown: 20 * time.Minute, + wantAllow: false, + }, + { + // count=4 with max=20m: min(120m, 20m) = 20m; elapsed 5m < 20m → blocked. + description: "count=4 with max cap at 20m: cap bounds growth, blocked", + historyCount: 4, + maxCooldown: 20 * time.Minute, + wantAllow: false, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + plugin := newTestPlugin(t, cooldown, tc.maxCooldown, vmi) + // Pre-populate history: spread entries so they all fall within the 6h window. 
+ for i := 0; i < tc.historyCount; i++ { + plugin.history.record(vmiUID, now.Add(-time.Duration(i+1)*30*time.Minute)) + } + got := plugin.PreEvictionFilter(pod) + if got != tc.wantAllow { + t.Errorf("PreEvictionFilter() = %v, want %v", got, tc.wantAllow) + } + }) + } +} + +// ── migrationHistory unit tests ─────────────────────────────────────────────── + +func TestMigrationHistory(t *testing.T) { + const ( + uid = types.UID("uid-test") + window = 24 * time.Hour + ) + now := time.Now() + + t.Run("empty history returns zero", func(t *testing.T) { + h := newMigrationHistory() + if got := h.countAndPrune(uid, window); got != 0 { + t.Errorf("countAndPrune() = %d, want 0", got) + } + }) + + t.Run("entries within window are counted", func(t *testing.T) { + h := newMigrationHistory() + h.record(uid, now.Add(-1*time.Hour)) + h.record(uid, now.Add(-2*time.Hour)) + if got := h.countAndPrune(uid, window); got != 2 { + t.Errorf("countAndPrune() = %d, want 2", got) + } + }) + + t.Run("entries outside the window are pruned", func(t *testing.T) { + h := newMigrationHistory() + h.record(uid, now.Add(-25*time.Hour)) // outside 24h window + h.record(uid, now.Add(-1*time.Hour)) // inside + if got := h.countAndPrune(uid, window); got != 1 { + t.Errorf("countAndPrune() = %d, want 1 (stale entry pruned)", got) + } + }) + + t.Run("all entries expired: map entry is deleted", func(t *testing.T) { + h := newMigrationHistory() + h.record(uid, now.Add(-25*time.Hour)) + if got := h.countAndPrune(uid, window); got != 0 { + t.Errorf("countAndPrune() = %d, want 0", got) + } + h.mu.Lock() + _, exists := h.completions[uid] + h.mu.Unlock() + if exists { + t.Error("map entry was not deleted after all entries expired") + } + }) +} + +func TestMigrationHistoryOnVMIUpdate(t *testing.T) { + const ( + ns = "default" + uid = types.UID("uid-update-test") + ) + now := time.Now() + + withUID := func(vmi *unstructured.Unstructured) *unstructured.Unstructured { + vmi.SetUID(uid) + return vmi + } + + 
t.Run("migration completion is recorded", func(t *testing.T) { + h := newMigrationHistory() + old := withUID(makeVMI(ns, "vmi", inProgressState(now.Add(-10*time.Minute)))) + new := withUID(makeVMI(ns, "vmi", completedState(now.Add(-10*time.Minute), now.Add(-1*time.Minute)))) + h.onVMIUpdate(old, new) + if got := h.countAndPrune(uid, 24*time.Hour); got != 1 { + t.Errorf("countAndPrune() = %d, want 1", got) + } + }) + + t.Run("update with unchanged endTimestamp is not re-recorded", func(t *testing.T) { + h := newMigrationHistory() + vmi := withUID(makeVMI(ns, "vmi", completedState(now.Add(-10*time.Minute), now.Add(-1*time.Minute)))) + h.onVMIUpdate(vmi, vmi) // same object, endTimestamp unchanged + if got := h.countAndPrune(uid, 24*time.Hour); got != 0 { + t.Errorf("countAndPrune() = %d, want 0 (no transition)", got) + } + }) + + t.Run("update with no migration state is ignored", func(t *testing.T) { + h := newMigrationHistory() + old := withUID(makeVMI(ns, "vmi", nil)) + new := withUID(makeVMI(ns, "vmi", nil)) + h.onVMIUpdate(old, new) + if got := h.countAndPrune(uid, 24*time.Hour); got != 0 { + t.Errorf("countAndPrune() = %d, want 0", got) + } + }) + + t.Run("second migration completion increments count", func(t *testing.T) { + h := newMigrationHistory() + first := withUID(makeVMI(ns, "vmi", completedState(now.Add(-3*time.Hour), now.Add(-2*time.Hour)))) + second := withUID(makeVMI(ns, "vmi", completedState(now.Add(-30*time.Minute), now.Add(-5*time.Minute)))) + h.onVMIUpdate(first, second) + if got := h.countAndPrune(uid, 24*time.Hour); got != 1 { + t.Errorf("countAndPrune() = %d, want 1", got) + } + }) +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/register.go b/pkg/framework/plugins/kubevirtmigrationaware/register.go new file mode 100644 index 0000000000..0db38920e2 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/register.go @@ -0,0 +1,31 @@ +/* +Copyright 2026 The Kubernetes Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubevirtmigrationaware
+
+import (
+	"k8s.io/apimachinery/pkg/runtime"
+)
+
+var (
+	// SchemeBuilder is this package's runtime.SchemeBuilder; init registers
+	// addDefaultingFuncs on it.
+	SchemeBuilder = runtime.NewSchemeBuilder()
+	// localSchemeBuilder points at SchemeBuilder so registrations made
+	// through it land on the exported builder.
+	localSchemeBuilder = &SchemeBuilder
+	// AddToScheme is the AddToScheme method of the package's scheme builder.
+	AddToScheme = localSchemeBuilder.AddToScheme
+)
+
+// init registers the package's defaulting functions with the scheme builder.
+func init() {
+	localSchemeBuilder.Register(addDefaultingFuncs)
+}
diff --git a/pkg/framework/plugins/kubevirtmigrationaware/types.go b/pkg/framework/plugins/kubevirtmigrationaware/types.go
new file mode 100644
index 0000000000..c5cfd6cd62
--- /dev/null
+++ b/pkg/framework/plugins/kubevirtmigrationaware/types.go
@@ -0,0 +1,49 @@
+/*
+Copyright 2026 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubevirtmigrationaware
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// +k8s:deepcopy-gen=true
+// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
+
+// KubevirtMigrationAwareArgs holds arguments used to configure the
+// KubevirtMigrationAware plugin.
+type KubevirtMigrationAwareArgs struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// MigrationCooldown is the minimum duration that must elapse after a VM
+	// live-migration completes before the descheduler may evict the virt-launcher
+	// pod again. The effective per-VM cooldown is max(MigrationCooldown,
+	// migration duration), so heavier VMs automatically receive longer protection.
+	// Defaults to 15m. Validation rejects negative values.
+	MigrationCooldown metav1.Duration `json:"migrationCooldown,omitempty"`
+
+	// MaxMigrationCooldown caps the adaptive per-VM cooldown computed as
+	// max(MigrationCooldown, migration duration) after exponential backoff is
+	// applied. Use this to prevent pathological cases (very slow migrations or
+	// heavy churn) from locking a VM indefinitely. Defaults to 6h.
+	// Validation rejects negative values and, when this field is positive,
+	// values smaller than MigrationCooldown.
+	MaxMigrationCooldown metav1.Duration `json:"maxMigrationCooldown,omitempty"`
+
+	// MigrationHistoryWindow is the sliding window over which past migration
+	// completions are counted for exponential-backoff purposes. Longer windows
+	// make the plugin sensitive to day-scale churn; shorter windows let VMs
+	// recover their clean record faster. Defaults to 24h.
+	// Validation rejects negative values.
+	MigrationHistoryWindow metav1.Duration `json:"migrationHistoryWindow,omitempty"`
+}
diff --git a/pkg/framework/plugins/kubevirtmigrationaware/validation.go b/pkg/framework/plugins/kubevirtmigrationaware/validation.go
new file mode 100644
index 0000000000..cd868be98d
--- /dev/null
+++ b/pkg/framework/plugins/kubevirtmigrationaware/validation.go
@@ -0,0 +1,41 @@
+/*
+Copyright 2026 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubevirtmigrationaware
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/runtime"
+)
+
+// ValidateKubevirtMigrationAwareArgs checks the plugin arguments and returns
+// the first violation found, or nil when the arguments are acceptable.
+//
+// obj must be a non-nil *KubevirtMigrationAwareArgs; any other value yields an
+// error instead of a panic. The checks, in order:
+//   - migrationCooldown must be non-negative;
+//   - maxMigrationCooldown must be non-negative;
+//   - a positive maxMigrationCooldown must be >= migrationCooldown (zero skips
+//     this comparison);
+//   - migrationHistoryWindow must be non-negative.
+func ValidateKubevirtMigrationAwareArgs(obj runtime.Object) error {
+	args, ok := obj.(*KubevirtMigrationAwareArgs)
+	if !ok || args == nil {
+		return fmt.Errorf("expected a non-nil *KubevirtMigrationAwareArgs, got %T", obj)
+	}
+
+	cooldown := args.MigrationCooldown.Duration
+	maxCooldown := args.MaxMigrationCooldown.Duration
+	historyWindow := args.MigrationHistoryWindow.Duration
+
+	switch {
+	case cooldown < 0:
+		return fmt.Errorf("migrationCooldown must be non-negative, got %v", cooldown)
+	case maxCooldown < 0:
+		return fmt.Errorf("maxMigrationCooldown must be non-negative, got %v", maxCooldown)
+	case maxCooldown > 0 && maxCooldown < cooldown:
+		return fmt.Errorf("maxMigrationCooldown (%v) must be >= migrationCooldown (%v)",
+			maxCooldown, cooldown)
+	case historyWindow < 0:
+		return fmt.Errorf("migrationHistoryWindow must be non-negative, got %v", historyWindow)
+	}
+	return nil
+}
diff --git a/pkg/framework/plugins/kubevirtmigrationaware/validation_test.go b/pkg/framework/plugins/kubevirtmigrationaware/validation_test.go
new file mode 100644
index 0000000000..65a14ef0f9
--- /dev/null
+++ b/pkg/framework/plugins/kubevirtmigrationaware/validation_test.go
@@ -0,0 +1,114 @@
+/*
+Copyright 2026 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubevirtmigrationaware
+
+import (
+	"testing"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// TestValidateArgs runs ValidateKubevirtMigrationAwareArgs over a table of
+// argument sets and checks only whether an error is returned.
+func TestValidateArgs(t *testing.T) {
+	// dur wraps a time.Duration in the metav1.Duration used by the args struct.
+	dur := func(d time.Duration) metav1.Duration {
+		return metav1.Duration{Duration: d}
+	}
+
+	tests := []struct {
+		name    string
+		in      *KubevirtMigrationAwareArgs
+		wantErr bool
+	}{
+		{
+			name:    "zero cooldown is valid (disables the cooldown gate)",
+			in:      &KubevirtMigrationAwareArgs{},
+			wantErr: false,
+		},
+		{
+			name:    "positive cooldown is valid",
+			in:      &KubevirtMigrationAwareArgs{MigrationCooldown: dur(5 * time.Minute)},
+			wantErr: false,
+		},
+		{
+			name:    "negative cooldown is invalid",
+			in:      &KubevirtMigrationAwareArgs{MigrationCooldown: dur(-1 * time.Second)},
+			wantErr: true,
+		},
+		{
+			name:    "negative maxMigrationCooldown is invalid",
+			in:      &KubevirtMigrationAwareArgs{MaxMigrationCooldown: dur(-1 * time.Second)},
+			wantErr: true,
+		},
+		{
+			name: "maxMigrationCooldown below migrationCooldown is invalid",
+			in: &KubevirtMigrationAwareArgs{
+				MigrationCooldown:    dur(15 * time.Minute),
+				MaxMigrationCooldown: dur(10 * time.Minute),
+			},
+			wantErr: true,
+		},
+		{
+			name: "maxMigrationCooldown equal to migrationCooldown is valid",
+			in: &KubevirtMigrationAwareArgs{
+				MigrationCooldown:    dur(15 * time.Minute),
+				MaxMigrationCooldown: dur(15 * time.Minute),
+			},
+			wantErr: false,
+		},
+		{
+			name: "maxMigrationCooldown greater than migrationCooldown is valid",
+			in: &KubevirtMigrationAwareArgs{
+				MigrationCooldown:    dur(15 * time.Minute),
+				MaxMigrationCooldown: dur(1 * time.Hour),
+			},
+			wantErr: false,
+		},
+		{
+			name: "zero maxMigrationCooldown (disabled) is always valid",
+			in: &KubevirtMigrationAwareArgs{
+				MigrationCooldown:    dur(15 * time.Minute),
+				MaxMigrationCooldown: dur(0),
+			},
+			wantErr: false,
+		},
+		{
+			name:    "positive migrationHistoryWindow is valid",
+			in:      &KubevirtMigrationAwareArgs{MigrationHistoryWindow: dur(24 * time.Hour)},
+			wantErr: false,
+		},
+		{
+			name:    "negative migrationHistoryWindow is invalid",
+			in:      &KubevirtMigrationAwareArgs{MigrationHistoryWindow: dur(-1 * time.Hour)},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := ValidateKubevirtMigrationAwareArgs(tt.in)
+			gotErr := err != nil
+			if gotErr != tt.wantErr {
+				t.Errorf("ValidateKubevirtMigrationAwareArgs() error = %v, wantErr = %v", err, tt.wantErr)
+			}
+		})
+	}
+}
diff --git a/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.deepcopy.go b/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.deepcopy.go
new file mode 100644
index 0000000000..98893d0563
--- /dev/null
+++ b/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.deepcopy.go
@@ -0,0 +1,54 @@
+//go:build !ignore_autogenerated
+// +build !ignore_autogenerated
+
+/*
+Copyright 2026 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Code generated by deepcopy-gen. DO NOT EDIT.
+
+package kubevirtmigrationaware
+
+import (
+	runtime "k8s.io/apimachinery/pkg/runtime"
+)
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *KubevirtMigrationAwareArgs) DeepCopyInto(out *KubevirtMigrationAwareArgs) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	out.MigrationCooldown = in.MigrationCooldown
+	out.MaxMigrationCooldown = in.MaxMigrationCooldown
+	out.MigrationHistoryWindow = in.MigrationHistoryWindow
+	return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubevirtMigrationAwareArgs.
+func (in *KubevirtMigrationAwareArgs) DeepCopy() *KubevirtMigrationAwareArgs {
+	if in == nil {
+		return nil
+	}
+	out := new(KubevirtMigrationAwareArgs)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *KubevirtMigrationAwareArgs) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
diff --git a/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.defaults.go b/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.defaults.go
new file mode 100644
index 0000000000..8f5b3c63b0
--- /dev/null
+++ b/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.defaults.go
@@ -0,0 +1,33 @@
+//go:build !ignore_autogenerated
+// +build !ignore_autogenerated
+
+/*
+Copyright 2026 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Code generated by defaulter-gen. DO NOT EDIT.
+
+package kubevirtmigrationaware
+
+import (
+	runtime "k8s.io/apimachinery/pkg/runtime"
+)
+
+// RegisterDefaults adds defaulters functions to the given scheme.
+// Public to allow building arbitrary schemes.
+// All generated defaulters are covering - they call all nested defaulters.
+func RegisterDefaults(scheme *runtime.Scheme) error {
+	return nil
+}