diff --git a/pkg/descheduler/setupplugins.go b/pkg/descheduler/setupplugins.go index eff3d049ae..59c67f5b10 100644 --- a/pkg/descheduler/setupplugins.go +++ b/pkg/descheduler/setupplugins.go @@ -19,6 +19,7 @@ package descheduler import ( "sigs.k8s.io/descheduler/pkg/framework/pluginregistry" "sigs.k8s.io/descheduler/pkg/framework/plugins/defaultevictor" + "sigs.k8s.io/descheduler/pkg/framework/plugins/kubevirtmigrationaware" "sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization" "sigs.k8s.io/descheduler/pkg/framework/plugins/podlifetime" "sigs.k8s.io/descheduler/pkg/framework/plugins/removeduplicates" @@ -47,4 +48,5 @@ func RegisterDefaultPlugins(registry pluginregistry.Registry) { pluginregistry.Register(removepodsviolatingnodeaffinity.PluginName, removepodsviolatingnodeaffinity.New, &removepodsviolatingnodeaffinity.RemovePodsViolatingNodeAffinity{}, &removepodsviolatingnodeaffinity.RemovePodsViolatingNodeAffinityArgs{}, removepodsviolatingnodeaffinity.ValidateRemovePodsViolatingNodeAffinityArgs, removepodsviolatingnodeaffinity.SetDefaults_RemovePodsViolatingNodeAffinityArgs, registry) pluginregistry.Register(removepodsviolatingnodetaints.PluginName, removepodsviolatingnodetaints.New, &removepodsviolatingnodetaints.RemovePodsViolatingNodeTaints{}, &removepodsviolatingnodetaints.RemovePodsViolatingNodeTaintsArgs{}, removepodsviolatingnodetaints.ValidateRemovePodsViolatingNodeTaintsArgs, removepodsviolatingnodetaints.SetDefaults_RemovePodsViolatingNodeTaintsArgs, registry) pluginregistry.Register(removepodsviolatingtopologyspreadconstraint.PluginName, removepodsviolatingtopologyspreadconstraint.New, &removepodsviolatingtopologyspreadconstraint.RemovePodsViolatingTopologySpreadConstraint{}, &removepodsviolatingtopologyspreadconstraint.RemovePodsViolatingTopologySpreadConstraintArgs{}, removepodsviolatingtopologyspreadconstraint.ValidateRemovePodsViolatingTopologySpreadConstraintArgs, 
removepodsviolatingtopologyspreadconstraint.SetDefaults_RemovePodsViolatingTopologySpreadConstraintArgs, registry) + pluginregistry.Register(kubevirtmigrationaware.PluginName, kubevirtmigrationaware.New, &kubevirtmigrationaware.KubevirtMigrationAware{}, &kubevirtmigrationaware.KubevirtMigrationAwareArgs{}, kubevirtmigrationaware.ValidateKubevirtMigrationAwareArgs, kubevirtmigrationaware.SetDefaults_KubevirtMigrationAwareArgs, registry) } diff --git a/pkg/framework/plugins/kubevirtmigrationaware/README.md b/pkg/framework/plugins/kubevirtmigrationaware/README.md new file mode 100644 index 0000000000..60e785a306 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/README.md @@ -0,0 +1,535 @@ +# KubevirtMigrationAware + +An `EvictorPlugin` that makes the descheduler aware of KubeVirt live-migration +state when deciding whether to evict `virt-launcher` pods. It prevents +evictions during active migrations and applies a self-tuning cooldown after +migrations complete, reducing per-VM churn without modifying any other part of +the descheduler. + +--- + +## 1. Problem + +### 1.1 The descheduler rebalances by evicting pods — but VMs are not pods + +The descheduler measures node utilisation, identifies outlier nodes, and evicts +pods from overloaded nodes so that the scheduler can place them somewhere +better. For regular stateless workloads this is harmless: the pod restarts +quickly on a new node and the cluster converges. + +For KubeVirt virtual machines the same eviction triggers a **live migration**. +The `virt-launcher` pod is evicted, KubeVirt moves the VM's memory and CPU +state to a destination node, and a new `virt-launcher` pod appears there. The +migration itself consumes CPU, memory bandwidth, and network capacity on both +source and destination nodes — resources that are visible to the very metrics +the descheduler uses to decide its next move. 
+ +### 1.2 Per-VM churn: the same VM migrated repeatedly + +Without any awareness of migration state, the descheduler can evict the same +VM multiple times in quick succession: + +1. VM is on node A (overloaded) → descheduler evicts it → migration starts. +2. Migration completes; VM is now on node B. +3. Node B's utilisation rises because the VM just landed there and is warming + up its CPU caches and memory working set. +4. Node B now looks like an outlier → descheduler evicts the VM again. +5. Repeat. + +This **churn loop** produces more migrations than the cluster needs, degrades +VM performance, and can prevent the cluster from ever reaching a stable state. + +### 1.3 Non-convergence at cluster scale + +A subtler and harder problem arises when the descheduler's outlier detection +uses cluster-relative thresholds. If all nodes run at sustained moderate load, +comparing nodes against each other always produces outliers — the cluster looks +imbalanced even when it is as balanced as the workload allows. + +Under these conditions the descheduler keeps evicting. Measurements on a +300-VM cluster running a sustained CPU/RAM stress profile showed roughly +**one eviction per VM per day** even after the cluster was already +post-rebalance. Raising the outlier margin from 10% to 20% cuts the volume +significantly; this plugin further reduces harm by rate-limiting how often any +individual VM can be evicted — but it does not change the fundamental +convergence property of the outlier algorithm. + +**Important:** this plugin addresses *per-VM churn* (the same VM migrated +repeatedly). It does not reduce *total migration volume*: if 50 VMs are in +cooldown, the descheduler will target the other 50. For total-volume +reduction, raise the outlier threshold in your descheduler profile. + +--- + +## 2. Architecture + +### 2.1 The descheduler is VM-blind by default + +The descheduler only watches `Pod` objects. 
It has no built-in concept of +VirtualMachineInstances (VMIs), migration state, or whether an eviction will +trigger a live migration or a simple pod restart. + +To protect VMs we need the descheduler to consult KubeVirt's VMI status before +evicting. This plugin provides that bridge. + +### 2.2 A dedicated VMI informer — no API-server calls in the hot path + +The plugin creates its own `DynamicSharedInformerFactory` that watches +`virtualmachineinstances.kubevirt.io/v1` across all namespaces. VMI objects +are stored in a local in-memory cache (a standard `client-go` informer store). + +Every call to `Filter` or `PreEvictionFilter` reads from this cache — no +API-server round-trip in the eviction hot path. The cache warms up at plugin +startup with a 30-second timeout; if the VMI informer cannot sync (e.g. +KubeVirt is not installed), the plugin fails fast and prevents the descheduler +from starting. + +The plugin also registers an `UpdateFunc` event handler on the same informer. +When a VMI's `status.migrationState.endTimestamp` transitions to a new +non-empty value — meaning a migration just completed — the handler records the +event in an in-memory history map keyed by VMI UID. This history drives the +exponential backoff described in §3.2. + +The link between a `virt-launcher` pod and its VMI is the annotation +`kubevirt.io/domain` that KubeVirt sets on every `virt-launcher` pod. Its +value is the VMI name. Pods without this annotation are not `virt-launcher` +pods and pass through both extension points unchanged. + +### 2.3 Two extension points: hard block vs. soft defer + +The descheduler's evictor pipeline exposes two distinct hooks, and the plugin +uses both for different purposes. + +**`Filter` — hard block.** Called during candidate selection. Returning +`false` removes the pod from the eviction candidate set entirely for this +descheduler cycle. 
The plugin uses this to block eviction of any +`virt-launcher` pod whose VMI has a migration actively in progress +(`startTimestamp` present, `endTimestamp` absent in `migrationState`). + +KubeVirt's `virt-api` already provides a complementary safety net: a +validating admission webhook that intercepts eviction requests and rejects +them when a migration is already in progress for that VM. Our `Filter` +acts upstream of that — it prevents the eviction attempt from being issued +at all, avoiding the API round-trip. In a distributed environment where +concurrent control loops may race, KubeVirt's webhook remains the +authoritative last line of defence; this plugin is defence-in-depth, not a +replacement. + +**`PreEvictionFilter` — soft defer.** Called immediately before each +individual eviction is issued. Returning `false` skips this pod and lets the +eviction loop try the next candidate on the same node. The plugin uses this to +apply the cooldown logic described in §3: if the VM migrated recently and the +cooldown has not expired, the eviction is deferred rather than hard-blocked, +giving other pods on the same node a chance to be evicted instead. + +The practical difference: `Filter` stops a pod from being a candidate at all; +`PreEvictionFilter` lets the loop skip a specific pod and try others. Both +must be enabled in the descheduler profile (see §5). + +--- + +## 3. Cooldown Logic + +### Two protection tiers with different durability + +Before describing the individual layers it is important to understand that the +cooldown logic has two fundamentally different durability tiers: + +**Tier A — VMI-persisted (survives descheduler pod restarts).** +Layer 1 reads `startTimestamp` and `endTimestamp` directly from +`status.migrationState` on the VMI object. KubeVirt writes and owns this +field; it persists on the VMI regardless of what happens to the descheduler +pod. A rolling update, an OOM kill, or a node eviction of the descheduler pod +does not erase it. 
Critically, this tier also captures the *cost* of the +migration: the difference between the two timestamps tells the plugin how long +that VM took to migrate, which directly raises the cooldown for expensive VMs. + +**Tier B — in-memory only (lost on descheduler pod restart).** +Layer 2 keeps a sliding-window history of migration-completion events in the +plugin's process memory. This drives the exponential backoff: a VM that +migrates repeatedly within the window gets a progressively longer cooldown. +The history is populated by informer events at runtime and is not persisted +anywhere. If the descheduler pod restarts the history is reset, and VMs that +had accumulated backoff appear clean again — see §8 for the operational +implications. + +The two tiers complement each other: Tier A provides a baseline guarantee that +is always present and restart-safe; Tier B adds stronger churn resistance for +VMs that are actively churning and its absence after a restart is typically +short-lived because the in-memory history rebuilds as migrations continue. + +--- + +The cooldown is computed in three layers applied in order. Each layer can only +increase the effective cooldown; none can reduce it below the previous layer's +result. + +### 3.1 Layer 1 — base adaptive cooldown + +``` +effectiveCooldown = max(migrationCooldown, migrationDuration) +``` + +`migrationCooldown` is the operator-configured minimum (default **15 minutes**). +`migrationDuration` is `endTimestamp − startTimestamp` read from the VMI's +`status.migrationState`. + +The `max` means that heavier VMs — ones that take a long time to migrate +because they have large memory footprints or high dirty-page rates — receive +proportionally longer protection automatically, without any manual per-VM +configuration. 
+ +**Examples with default `migrationCooldown = 15m`:** + +| VM type | Migration duration | Effective cooldown (layer 1) | +|---|---|---| +| Small idle VM | 30 s | 15 m (configured floor dominates) | +| Medium VM | 10 m | 15 m (configured floor dominates) | +| Large memory VM | 25 m | 25 m (duration dominates) | +| Monster VM | 90 m | 90 m (duration dominates) | + +If `startTimestamp` is absent or malformed the migration duration cannot be +computed; the plugin falls back to `migrationCooldown` alone. + +### 3.2 Layer 2 — exponential backoff from migration history + +``` +effectiveCooldown = layer1Result × 2^(count − 1) (for count ≥ 1) +``` + +`count` is the number of migration completions recorded for this VMI in the +`migrationHistoryWindow` (default **24 hours**). The history is populated by +the informer event handler (§2.2), which fires each time a VMI's +`endTimestamp` changes — capturing every migration the cluster runs for that +VMI, not just descheduler-caused ones. + +The effect is that each successive migration within the window doubles the +cooldown, making it progressively harder to evict a VM that keeps getting +churned: + +**Examples with default `migrationCooldown = 15m`, small VM (duration < 15m):** + +| Migrations in last 24h | Effective cooldown (layer 2) | +|---:|---| +| 0 or 1 | 15 m | +| 2 | 30 m | +| 3 | 1 h | +| 4 | 2 h | +| 5 | 4 h | +| 6+ | capped by layer 3 | + +A VM that migrates once or twice recovers its normal cooldown naturally as old +history entries age out of the 24-hour window. A VM that migrates 5+ times in +a day is almost certainly in a churn loop; it gets progressively longer +protection until the loop breaks. + +> **Note on the race condition:** the informer event handler fires +> asynchronously. Between the moment a migration completes and the moment the +> descheduler's next cycle calls `PreEvictionFilter`, the event handler may or +> may not have recorded the completion yet. 
In the worst case `count` is +> under-reported by 1, meaning the backoff multiplier is at most one doubling +> short (half the expected value) for a single cycle. This is intentional and +> acceptable: the correct value is applied in the next cycle. + +### 3.3 Layer 3 — maximum cooldown cap + +``` +effectiveCooldown = min(layer2Result, maxMigrationCooldown) +``` + +`maxMigrationCooldown` (default **6 hours**) bounds the growth from both the +adaptive duration (layer 1) and the exponential backoff (layer 2). + +Without this cap, a VM with 8 migrations in 24 hours on a 15-minute base +cooldown would reach `15m × 2^7 = 32 h`, locking the VM for longer than the +history window itself. The cap ensures the descheduler always has an +opportunity to re-evaluate after at most 6 hours, regardless of how severe the +churn history is. + +Setting `maxMigrationCooldown: 0` does not disable the cap: a zero value is +treated as unset and replaced by the 6-hour default (see §4). Choosing a very +large cap is not recommended with a long `migrationHistoryWindow` because +backoff can then grow almost unbounded. + +**Full worked example — large VM in a churn loop:** + +Assume: `migrationCooldown: 15m`, `maxMigrationCooldown: 6h`, +`migrationHistoryWindow: 24h`. The VM has a 30-minute migration duration. + +| Event | Time | count in 24h window | Layer 1 | Layer 2 | Layer 3 (cap) | +|---|---|---:|---|---|---| +| 1st migration completes | T+0 | 1 | 30 m | 30 m | 30 m | +| 2nd migration completes | T+31m | 2 | 30 m | 60 m | 60 m | +| 3rd migration completes | T+2h | 3 | 30 m | 2 h | 2 h | +| 4th migration completes | T+5h | 4 | 30 m | 4 h | 4 h | +| 5th migration completes | T+10h | 5 | 30 m | 8 h | **6 h** (capped) | +| 1st entry ages out at T+24h | T+25h | 4 | 30 m | 4 h | 4 h | + +The VM steps back down gradually as history entries age out — it does not +suddenly go from fully protected to fully evictable. + +--- + +## 4. Configuration Reference + +All fields are optional. Omitting a field (or setting it to `0`) causes the +default to apply. 
+ +| Field | Default | Valid range | Description | +|---|---|---|---| +| `migrationCooldown` | `15m` | `≥ 0` | Minimum cooldown after any migration. `0` is treated as unset and selects the default; the adaptive duration (layer 1) can only raise the cooldown above this floor. | +| `maxMigrationCooldown` | `6h` | `≥ migrationCooldown` or `0` | Upper bound on the effective cooldown after all layers. `0` is treated as unset and selects the default. | +| `migrationHistoryWindow` | `24h` | `≥ 0` | Sliding window for migration-count history used by exponential backoff. `0` is treated as unset and selects the default. | + +**Validation rules:** +- `migrationCooldown` must be non-negative. +- `maxMigrationCooldown` must be non-negative. +- If both are non-zero, `maxMigrationCooldown ≥ migrationCooldown` (a cap + below the floor is a misconfiguration). +- `migrationHistoryWindow` must be non-negative. + +### Example configurations + +**Conservative — minimal interference, short memory:** +```yaml +migrationCooldown: 5m +maxMigrationCooldown: 1h +migrationHistoryWindow: 6h +``` +Suitable for clusters where workloads are expected to migrate frequently for +legitimate reasons (e.g. scheduled maintenance windows) and operators do not +want the backoff to accumulate. + +**Default — balanced protection:** +```yaml +# All defaults; these values are applied automatically when fields are omitted. +migrationCooldown: 15m +maxMigrationCooldown: 6h +migrationHistoryWindow: 24h +``` + +**Protective — strong churn resistance:** +```yaml +migrationCooldown: 30m +maxMigrationCooldown: 12h +migrationHistoryWindow: 48h +``` +Suitable for clusters with large memory-intensive VMs where each migration is +expensive and operators want the descheduler to back off aggressively after +repeated evictions. + +--- + +## 5. Profile Setup + +The plugin must be listed under **both** `filter` and `preEvictionFilter` in +the descheduler profile. Listing it under only one extension point silently +disables the other; there is no error. 
+ +The `DefaultEvictor` is automatically injected by the descheduler and does not +need to appear in the `plugins` section. It does however need an explicit +`pluginConfig` entry if you want non-default behaviour. For KubeVirt workloads +`nodeFit: true` is strongly recommended: it makes the descheduler verify that a +suitable destination node exists before issuing an eviction, preventing a VM +from being evicted into a situation where the scheduler has nowhere valid to +place it. + +```yaml +apiVersion: "descheduler/v1alpha2" +kind: "DeschedulerPolicy" +profiles: + - name: KubevirtRelieveAndMigrate + pluginConfig: + - name: KubevirtMigrationAware + args: + migrationCooldown: 15m + maxMigrationCooldown: 6h + migrationHistoryWindow: 24h + - name: DefaultEvictor + args: + nodeFit: true # only evict when a valid destination node exists + - name: LowNodeUtilization + args: + thresholds: + MetricResource: 10 + targetThresholds: + MetricResource: 10 + useDeviationThresholds: true + plugins: + filter: + enabled: + - KubevirtMigrationAware # hard-blocks eviction during active migration + preEvictionFilter: + enabled: + - KubevirtMigrationAware # soft-defers eviction during cooldown + balance: + enabled: + - LowNodeUtilization +``` + +The RBAC for the descheduler's service account must include `list` and `watch` +on `virtualmachineinstances` in all namespaces: + +```yaml +- apiGroups: ["kubevirt.io"] + resources: ["virtualmachineinstances"] + verbs: ["list", "watch"] +``` + +--- + +## 6. Observability + +### Metrics + +The plugin registers two Prometheus metrics. + +**`descheduler_kubevirt_eviction_blocks_total`** — counter + +Incremented each time the plugin prevents an eviction, labelled by `reason`, +`node`, and `namespace`. + +| Label | Values | Description | +|---|---|---| +| `reason` | `migration_in_progress` | Blocked by `Filter`: VMI has an active migration. | +| `reason` | `cooldown` | Blocked by `PreEvictionFilter`: VMI is within its cooldown window. 
| +| `node` | node name | The node the `virt-launcher` pod was scheduled on. | +| `namespace` | namespace name | The namespace of the `virt-launcher` pod and VMI. | + +**`descheduler_kubevirt_effective_cooldown_seconds`** — histogram + +Recorded each time a cooldown block is applied, capturing the effective +cooldown duration in seconds. Bucket boundaries correspond to the exponential +backoff steps under default configuration: +`900` (15 m), `1800` (30 m), `3600` (1 h), `7200` (2 h), `14400` (4 h), +`21600` (6 h). + +If most observations land in the `900` bucket, the plugin is applying only the +base cooldown — backoff is not engaging significantly. If observations shift +toward `21600`, many VMs are hitting the cap, indicating sustained churn that +warrants operator attention. + +### Useful PromQL queries + +**Is the plugin actively blocking evictions on any node?** +```promql +rate(descheduler_kubevirt_eviction_blocks_total[10m]) > 0 +``` + +**Which nodes are seeing the most cooldown blocks over the last hour?** +```promql +topk(10, + increase(descheduler_kubevirt_eviction_blocks_total{reason="cooldown"}[1h]) +) +``` + +**Are VMs hitting the cap (6 h bucket) — sign of a churn loop?** +```promql +( + sum(increase(descheduler_kubevirt_effective_cooldown_seconds_bucket{le="21600"}[1h])) + - + sum(increase(descheduler_kubevirt_effective_cooldown_seconds_bucket{le="14400"}[1h])) +) +/ +sum(increase(descheduler_kubevirt_effective_cooldown_seconds_count[1h])) +``` +Histogram buckets are cumulative, so the `le="14400"` bucket is subtracted to +isolate observations in the (4 h, 6 h] range. A ratio close to 1.0 means most +blocked evictions carry a cooldown at or near the 6-hour cap — a sign of +sustained churn. A low ratio means most blocks are coming from shorter +cooldowns such as the base 15 minutes — normal expected behaviour. + +**Suggested alert — sustained blocking on a single node:** +```promql +sum by (node) ( + rate(descheduler_kubevirt_eviction_blocks_total{reason="cooldown"}[30m]) +) > (1 / 600) +``` +`rate()` is per-second, so `1 / 600` corresponds to one block every 10 minutes. +This fires if a node is seeing more than one cooldown block every ~10 minutes +over a 30-minute window, which may indicate that all VMs on that node are in +cooldown and the descheduler cannot rebalance it. + +--- + +## 7. 
Scheduler–Descheduler Decoupling + +The descheduler and the scheduler are independent, stateless components with +no shared state. When the descheduler evicts a `virt-launcher` pod from an +overloaded node it is making a bet: it hopes the scheduler will place the new +pod somewhere better, but it has no control over — and no visibility into — +where that placement will actually land. + +The **soft-tainter** (a separate operator component) tries to close this +information gap by applying `PreferNoSchedule` taints to overloaded nodes, +nudging the scheduler away from them. This works well for most workloads, but +a VM that carries specific scheduling constraints — `nodeSelector`, `nodeAffinity`, +pod affinity rules, or a narrow set of tolerated taints — can bypass +`PreferNoSchedule` entirely and land back on a suboptimal node regardless. + +When that happens this plugin's cooldown mechanism provides a backstop: the VM +that just migrated onto the wrong node will not be immediately re-evicted in a +tight loop. The exponential backoff described in §3.2 makes each successive +eviction of the same VM progressively harder, giving the cluster time to +stabilise or for an operator to intervene. + +However, the cooldown only protects the specific VM that landed badly. The +descheduler still sees an overloaded node and will continue to evict *other* +eligible VMs from it, which may or may not improve the situation depending on +their scheduling constraints. The fundamental fix for mis-placed VMs is +correct scheduling configuration, not rate-limiting. + +--- + +## 8. Known Limitations + +**Migration history (Tier B) is in-memory and lost on restart.** +The 24-hour migration history that drives exponential backoff (§3, Tier B) is +stored in the plugin's process memory. If the descheduler pod is restarted — +rolling update, OOM kill, node eviction — the history is reset. 
VMs that had +accumulated backoff appear clean again, and the descheduler may trigger a burst +of migrations immediately after restart. + +The VMI-persisted state (§3, Tier A) — the last migration's start and end +timestamps on the VMI object — is unaffected by a descheduler restart and +continues to enforce the base adaptive cooldown. The window of vulnerability +is therefore bounded: the per-VM base cooldown (layer 1) remains intact; only +the churn-history multiplier (layer 2) is lost until the in-memory history +rebuilds. + +**All migrations are counted, not only descheduler-caused ones.** +The informer handler fires on every `endTimestamp` transition, including +migrations triggered by node drain, KubeVirt's own resource management, or +manual operator actions. A VM that was legitimately drained for hardware +maintenance will enter the backoff history alongside descheduler-driven +evictions. This is conservative: the plugin may protect a VM that does not +strictly need protection, but it will never fail to protect one that does. + +**Cooldown protects individual VMs but does not reduce total migration volume.** +If 50 VMs on a cluster are in cooldown, the descheduler selects the other 50 +as candidates. The total number of migrations on the cluster does not decrease; +the benefit is that no single VM is churned repeatedly. For clusters where the +outlier threshold keeps finding outliers under sustained load, raising the +outlier margin is a more effective lever than this plugin alone. + +**Mixed VM sizes can produce structural non-convergence.** +In a cluster with heterogeneous VM sizes (e.g. 144 × 1 Gi VMs and 1 × 64 Gi +VM), the descheduler may never reach a stable state: migrating the large VM +sharply changes utilisation on both source and destination, which can flip +which nodes satisfy the outlier predicate on the next cycle, causing +back-and-forth movement. 
This plugin's adaptive cooldown gives the large VM +more protection (its migration takes longer, so its layer-1 cooldown is +larger), but the surrounding small VMs are still subject to ongoing eviction. +True convergence in this scenario requires VM-size-aware eviction selection, +which is outside the scope of this plugin. + +--- + +## 9. Fail-Open Contract + +Every code path that cannot retrieve or parse VMI state allows the eviction to +proceed rather than blocking it. Specifically: + +- Pod has no `kubevirt.io/domain` annotation → not a `virt-launcher` pod → pass through. +- VMI not found in informer cache (cache miss, different namespace, VMI deleted) → pass through. +- VMI object is not of type `*unstructured.Unstructured` → pass through. +- `startTimestamp` or `endTimestamp` is absent or not valid RFC 3339 → treat as no migration record → pass through. + +The plugin never hard-fails in a way that would prevent evictions of unrelated +workloads. If KubeVirt is uninstalled after the plugin starts, the informer +cache goes stale but continues to serve the last known state; cache misses for +new VMIs fail open. + +The only hard failure is at startup: if the VMI informer cache does not sync +within 30 seconds, the plugin returns an error and the descheduler does not +start. This is intentional — operating with a permanently empty or stale cache +would silently remove all VM protections. diff --git a/pkg/framework/plugins/kubevirtmigrationaware/defaults.go b/pkg/framework/plugins/kubevirtmigrationaware/defaults.go new file mode 100644 index 0000000000..bed5900765 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/defaults.go @@ -0,0 +1,47 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubevirtmigrationaware + +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// Fallback durations that SetDefaults_KubevirtMigrationAwareArgs assigns to +// fields left at zero. +const ( + defaultMigrationCooldown = 15 * time.Minute + defaultMaxMigrationCooldown = 6 * time.Hour + defaultMigrationHistoryWindow = 24 * time.Hour +) + +// addDefaultingFuncs registers this package's defaulting functions on scheme +// by delegating to RegisterDefaults and returns its error unchanged. +func addDefaultingFuncs(scheme *runtime.Scheme) error { + return RegisterDefaults(scheme) +} + +// SetDefaults_KubevirtMigrationAwareArgs replaces every zero-valued duration in +// obj (MigrationCooldown, MaxMigrationCooldown, MigrationHistoryWindow) with the +// corresponding package default; non-zero values are left untouched. +// +// obj is asserted to *KubevirtMigrationAwareArgs without a check, so passing any +// other type panics. Because the zero value is what triggers defaulting, an +// explicit 0 cannot be told apart from an omitted field here. +func SetDefaults_KubevirtMigrationAwareArgs(obj runtime.Object) { + args := obj.(*KubevirtMigrationAwareArgs) + if args.MigrationCooldown.Duration == 0 { + args.MigrationCooldown = metav1.Duration{Duration: defaultMigrationCooldown} + } + if args.MaxMigrationCooldown.Duration == 0 { + args.MaxMigrationCooldown = metav1.Duration{Duration: defaultMaxMigrationCooldown} + } + if args.MigrationHistoryWindow.Duration == 0 { + args.MigrationHistoryWindow = metav1.Duration{Duration: defaultMigrationHistoryWindow} + } +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/defaults_test.go b/pkg/framework/plugins/kubevirtmigrationaware/defaults_test.go new file mode 100644 index 0000000000..6e03073fea --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/defaults_test.go @@ -0,0 +1,55 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubevirtmigrationaware + +import ( + "testing" + "time" +) + +// TestSetDefaults checks SetDefaults_KubevirtMigrationAwareArgs in two cases: +// a zero-valued args struct must receive every package default, and explicitly +// set non-zero durations must be left unchanged. +func TestSetDefaults(t *testing.T) { + t.Run("zero value gets all defaults", func(t *testing.T) { + // All three durations start at zero, so each must be defaulted. + args := &KubevirtMigrationAwareArgs{} + SetDefaults_KubevirtMigrationAwareArgs(args) + if args.MigrationCooldown.Duration != defaultMigrationCooldown { + t.Errorf("MigrationCooldown = %v, want %v", args.MigrationCooldown.Duration, defaultMigrationCooldown) + } + if args.MaxMigrationCooldown.Duration != defaultMaxMigrationCooldown { + t.Errorf("MaxMigrationCooldown = %v, want %v", args.MaxMigrationCooldown.Duration, defaultMaxMigrationCooldown) + } + if args.MigrationHistoryWindow.Duration != defaultMigrationHistoryWindow { + t.Errorf("MigrationHistoryWindow = %v, want %v", args.MigrationHistoryWindow.Duration, defaultMigrationHistoryWindow) + } + }) + + t.Run("explicit values are preserved", func(t *testing.T) { + // Values deliberately differ from the defaults so an overwrite is detectable. + args := &KubevirtMigrationAwareArgs{} + args.MigrationCooldown.Duration = 10 * time.Minute + args.MaxMigrationCooldown.Duration = 4 * time.Hour + args.MigrationHistoryWindow.Duration = 12 * time.Hour + SetDefaults_KubevirtMigrationAwareArgs(args) + if args.MigrationCooldown.Duration != 10*time.Minute { + t.Errorf("MigrationCooldown = %v, want 10m", args.MigrationCooldown.Duration) + } + if args.MaxMigrationCooldown.Duration != 4*time.Hour { + t.Errorf("MaxMigrationCooldown = %v, want 4h", args.MaxMigrationCooldown.Duration) + } + if args.MigrationHistoryWindow.Duration != 12*time.Hour { + t.Errorf("MigrationHistoryWindow = %v, want 12h", args.MigrationHistoryWindow.Duration) + } + }) +} 
diff --git a/pkg/framework/plugins/kubevirtmigrationaware/doc.go b/pkg/framework/plugins/kubevirtmigrationaware/doc.go new file mode 100644 index 0000000000..d94f6d2c07 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// +k8s:defaulter-gen=TypeMeta + +package kubevirtmigrationaware diff --git a/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware.go b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware.go new file mode 100644 index 0000000000..c0ac3a3273 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware.go @@ -0,0 +1,408 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Package kubevirtmigrationaware provides an EvictorPlugin that prevents the +// descheduler from evicting virt-launcher pods while a VM live-migration is in +// progress (Filter) and suppresses re-eviction during a configurable cooldown +// period after the migration completes (PreEvictionFilter). +// +// Both extension points operate on the per-VMI migrationState recorded in the +// VMI status, which is kept in a local informer cache to avoid API-server load +// in the hot eviction path. +package kubevirtmigrationaware + +import ( + "context" + "fmt" + "sort" + "sync" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + k8smetrics "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/dynamic/dynamicinformer" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2" + + frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" +) + +const ( + PluginName = "KubevirtMigrationAware" + + // virt-launcher pods carry this annotation with the VMI name as value. + vmiAnnotationKey = "kubevirt.io/domain" + + // VMI GVR in the kubevirt.io API group. + vmiGroup = "kubevirt.io" + vmiVersion = "v1" + vmiResource = "virtualmachineinstances" + + // Timeout for the initial informer cache sync at plugin startup. + cacheWarmupTimeout = 30 * time.Second + + reasonMigrationInProgress = "migration_in_progress" + reasonCooldown = "cooldown" +) + +var ( + vmiGVR = schema.GroupVersionResource{ + Group: vmiGroup, + Version: vmiVersion, + Resource: vmiResource, + } + + // evictionBlocksTotal counts how many times the plugin prevented an eviction, + // labelled by reason, node, and namespace. Use this to identify which nodes + // or tenant namespaces are experiencing repeated eviction gating. 
+ evictionBlocksTotal = k8smetrics.NewCounterVec( + &k8smetrics.CounterOpts{ + Subsystem: "descheduler", + Name: "kubevirt_eviction_blocks_total", + Help: "Number of virt-launcher pod evictions blocked by KubevirtMigrationAware, by reason, node, and namespace.", + StabilityLevel: k8smetrics.ALPHA, + }, + []string{"reason", "node", "namespace"}, + ) + + // effectiveCooldownSeconds is a histogram of the adaptive cooldown durations + // applied when deferring evictions. Bucket boundaries match the exponential + // backoff steps with default configuration (15m base, 6h cap), so the + // distribution directly shows whether VMs are hitting the base cooldown or + // being pushed toward the cap by repeated migrations. + effectiveCooldownSeconds = k8smetrics.NewHistogram( + &k8smetrics.HistogramOpts{ + Subsystem: "descheduler", + Name: "kubevirt_effective_cooldown_seconds", + Help: "Distribution of effective cooldown durations applied when deferring virt-launcher pod evictions, in seconds.", + StabilityLevel: k8smetrics.ALPHA, + Buckets: []float64{900, 1800, 3600, 7200, 14400, 21600}, // 15m 30m 1h 2h 4h 6h + }, + ) + + registerMetricsOnce sync.Once +) + +// migrationHistory tracks per-VMI migration completion timestamps within a +// sliding window so that the plugin can apply exponential backoff to VMs that +// are migrated repeatedly. +type migrationHistory struct { + mu sync.Mutex + completions map[types.UID][]time.Time // sorted ascending; pruned lazily +} + +func newMigrationHistory() *migrationHistory { + return &migrationHistory{completions: make(map[types.UID][]time.Time)} +} + +// record appends a migration completion event for the given VMI. +func (h *migrationHistory) record(uid types.UID, t time.Time) { + h.mu.Lock() + defer h.mu.Unlock() + h.completions[uid] = append(h.completions[uid], t) +} + +// countAndPrune returns the number of migrations recorded for uid within +// window, pruning expired entries in the process. 
+func (h *migrationHistory) countAndPrune(uid types.UID, window time.Duration) int { + h.mu.Lock() + defer h.mu.Unlock() + ts := h.completions[uid] + if len(ts) == 0 { + return 0 + } + cutoff := time.Now().Add(-window) + i := sort.Search(len(ts), func(i int) bool { return ts[i].After(cutoff) }) + if i == len(ts) { + delete(h.completions, uid) + return 0 + } + if i > 0 { + h.completions[uid] = ts[i:] + } + return len(h.completions[uid]) +} + +// onVMIUpdate is the informer UpdateFunc handler. It records a migration +// completion when endTimestamp transitions to a new non-empty value. +func (h *migrationHistory) onVMIUpdate(oldObj, newObj interface{}) { + oldU, ok := oldObj.(*unstructured.Unstructured) + if !ok { + return + } + newU, ok := newObj.(*unstructured.Unstructured) + if !ok { + return + } + oldEnd, _, _ := unstructured.NestedString(oldU.Object, "status", "migrationState", "endTimestamp") + newEnd, _, _ := unstructured.NestedString(newU.Object, "status", "migrationState", "endTimestamp") + if newEnd == "" || newEnd == oldEnd { + return + } + t, err := time.Parse(time.RFC3339, newEnd) + if err != nil { + t = time.Now() + } + h.record(newU.GetUID(), t) +} + +// KubevirtMigrationAware is an EvictorPlugin. +// +// - Filter: hard-blocks eviction of virt-launcher pods whose VMI is +// currently mid-migration (startTimestamp set, endTimestamp absent). +// +// - PreEvictionFilter: soft-blocks eviction of virt-launcher pods whose VMI +// completed a migration within the configured MigrationCooldown window, +// allowing the eviction loop to skip and try other candidates instead. +type KubevirtMigrationAware struct { + logger klog.Logger + handle frameworktypes.Handle + args *KubevirtMigrationAwareArgs + vmiLister cache.GenericLister + history *migrationHistory +} + +var _ frameworktypes.EvictorPlugin = &KubevirtMigrationAware{} + +// New builds the plugin from its arguments. 
+// It creates a dedicated dynamic client and VMI informer so that Filter and
+// PreEvictionFilter can read VMI state from a local cache instead of hitting
+// the API server on every eviction decision.
+//
+// Two informer handlers feed the migration history: updates record migration
+// completions, and deletions release the deleted VMI's history. Without the
+// delete handler the history map would keep an entry for every VMI that ever
+// migrated, because expired entries are otherwise pruned only when a live VMI
+// is looked up.
+//
+// It returns an error when args has the wrong type, when the in-cluster
+// config or dynamic client cannot be built, when the event handlers cannot be
+// registered, or when the VMI cache does not sync within cacheWarmupTimeout.
+func New(ctx context.Context, args runtime.Object, handle frameworktypes.Handle) (frameworktypes.Plugin, error) {
+	kmaArgs, ok := args.(*KubevirtMigrationAwareArgs)
+	if !ok {
+		return nil, fmt.Errorf("want args to be of type KubevirtMigrationAwareArgs, got %T", args)
+	}
+
+	cfg, err := rest.InClusterConfig()
+	if err != nil {
+		return nil, fmt.Errorf("failed to build in-cluster REST config: %w", err)
+	}
+
+	dynClient, err := dynamic.NewForConfig(cfg)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create dynamic client: %w", err)
+	}
+
+	// Create an all-namespaces informer factory with no re-sync (0 = disabled).
+	factory := dynamicinformer.NewDynamicSharedInformerFactory(dynClient, 0)
+	vmiGenericInformer := factory.ForResource(vmiGVR)
+
+	history := newMigrationHistory()
+	if _, err = vmiGenericInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			history.onVMIUpdate(oldObj, newObj)
+		},
+		DeleteFunc: func(obj interface{}) {
+			// A missed delete is delivered as a tombstone wrapping the last
+			// known state of the object.
+			if tombstone, isTombstone := obj.(cache.DeletedFinalStateUnknown); isTombstone {
+				obj = tombstone.Obj
+			}
+			vmi, isVMI := obj.(*unstructured.Unstructured)
+			if !isVMI {
+				return
+			}
+			history.mu.Lock()
+			delete(history.completions, vmi.GetUID())
+			history.mu.Unlock()
+		},
+	}); err != nil {
+		return nil, fmt.Errorf("failed to register VMI event handler: %w", err)
+	}
+
+	factory.Start(ctx.Done())
+
+	// Bound the initial sync so a cluster without the VMI resource fails fast
+	// instead of blocking plugin construction indefinitely.
+	syncCtx, cancel := context.WithTimeout(ctx, cacheWarmupTimeout)
+	defer cancel()
+	if !cache.WaitForCacheSync(syncCtx.Done(), vmiGenericInformer.Informer().HasSynced) {
+		return nil, fmt.Errorf("timed out waiting for VMI informer cache to sync (is KubeVirt installed?)")
+	}
+
+	return newPlugin(ctx, kmaArgs, handle, vmiGenericInformer.Lister(), history)
+}
+
+// newPlugin is the internal constructor used by both New (production) and tests.
+// Tests call this directly with a pre-populated fake lister, bypassing the
+// dynamic client and in-cluster config entirely.
+func newPlugin(ctx context.Context, args *KubevirtMigrationAwareArgs, handle frameworktypes.Handle, lister cache.GenericLister, history *migrationHistory) (frameworktypes.Plugin, error) {
+	logger := klog.FromContext(ctx).WithValues("plugin", PluginName)
+
+	// Metrics are package-level; register them once no matter how many plugin
+	// instances are built.
+	registerMetricsOnce.Do(func() {
+		legacyregistry.MustRegister(evictionBlocksTotal, effectiveCooldownSeconds)
+	})
+
+	logger.V(2).Info("VMI lister ready",
+		"cooldown", args.MigrationCooldown.Duration,
+		"maxCooldown", args.MaxMigrationCooldown.Duration,
+		"historyWindow", args.MigrationHistoryWindow.Duration)
+
+	return &KubevirtMigrationAware{
+		logger:    logger,
+		handle:    handle,
+		args:      args,
+		vmiLister: lister,
+		history:   history,
+	}, nil
+}
+
+// Name returns the plugin name.
+func (k *KubevirtMigrationAware) Name() string {
+	return PluginName
+}
+
+// Filter returns false (block eviction) for virt-launcher pods whose
+// corresponding VMI is currently mid-migration. Non-virt-launcher pods and
+// any VMI lookup failure are passed through (fail open). Each block
+// increments evictionBlocksTotal with reason "migration_in_progress".
+func (k *KubevirtMigrationAware) Filter(pod *v1.Pod) bool {
+	uObj, ok := k.getVMI(pod)
+	if !ok {
+		return true
+	}
+
+	if migrationInProgress(uObj) {
+		k.logger.V(3).Info("VMI migration in progress, blocking eviction",
+			"pod", klog.KObj(pod), "vmi", pod.Annotations[vmiAnnotationKey], "node", pod.Spec.NodeName)
+		evictionBlocksTotal.WithLabelValues(reasonMigrationInProgress, pod.Spec.NodeName, pod.Namespace).Inc()
+		return false
+	}
+
+	return true
+}
+
+// PreEvictionFilter returns false (defer eviction) for virt-launcher pods
+// whose corresponding VMI completed a migration within the effective cooldown
+// window. Pods without a VMI in the cache, and VMIs without a parseable
+// endTimestamp, are allowed (fail open).
+//
+// The effective cooldown is computed in three steps:
+//  1. Base: max(MigrationCooldown, migration duration) — heavier VMs get longer
+//     protection automatically.
+//  2. Exponential backoff: the base is doubled for each additional migration
+//     recorded within MigrationHistoryWindow, so repeatedly migrated VMs are
+//     progressively protected against churn.
+//  3. Cap: if MaxMigrationCooldown is non-zero, the result is capped there.
+//
+// Each deferral increments evictionBlocksTotal with reason "cooldown" and
+// observes the effective cooldown in effectiveCooldownSeconds.
+func (k *KubevirtMigrationAware) PreEvictionFilter(pod *v1.Pod) bool {
+	uObj, ok := k.getVMI(pod)
+	if !ok {
+		return true
+	}
+
+	endTime, ok := migrationEndTime(uObj)
+	if !ok {
+		return true
+	}
+
+	// Zero means "no cap".
+	maxCooldown := k.args.MaxMigrationCooldown.Duration
+
+	// Step 1: base = max(configured, migration duration).
+	effectiveCooldown := k.args.MigrationCooldown.Duration
+	if startTime, hasStart := migrationStartTime(uObj); hasStart {
+		if d := endTime.Sub(*startTime); d > effectiveCooldown {
+			effectiveCooldown = d
+		}
+	}
+
+	// Step 2: double the cooldown for each migration beyond the first recorded
+	// in the history window. Uses an overflow-safe doubling loop.
+	count := k.history.countAndPrune(uObj.GetUID(), k.args.MigrationHistoryWindow.Duration)
+	for i := 1; i < count; i++ {
+		next := effectiveCooldown * 2
+		if next/2 != effectiveCooldown { // int64 overflow guard
+			break
+		}
+		effectiveCooldown = next
+		if maxCooldown > 0 && effectiveCooldown >= maxCooldown {
+			break // cap will be applied in step 3; no point doubling further
+		}
+	}
+
+	// Step 3: apply the optional upper bound.
+	if maxCooldown > 0 && effectiveCooldown > maxCooldown {
+		effectiveCooldown = maxCooldown
+	}
+
+	elapsed := time.Since(*endTime)
+	if elapsed < effectiveCooldown {
+		remaining := effectiveCooldown - elapsed
+		k.logger.V(3).Info("VMI in migration cooldown, deferring eviction",
+			"pod", klog.KObj(pod), "vmi", pod.Annotations[vmiAnnotationKey], "node", pod.Spec.NodeName,
+			"migrationCount", count,
+			"elapsed", elapsed.Round(time.Second),
+			"effectiveCooldown", effectiveCooldown.Round(time.Second),
+			"remaining", remaining.Round(time.Second))
+		evictionBlocksTotal.WithLabelValues(reasonCooldown, pod.Spec.NodeName, pod.Namespace).Inc()
+		effectiveCooldownSeconds.Observe(effectiveCooldown.Seconds())
+		return false
+	}
+
+	return true
+}
+
+// getVMI looks up the VMI for a virt-launcher pod from the informer cache.
+// The VMI name is taken from the pod's kubevirt.io/domain annotation and
+// resolved in the pod's own namespace.
+// Returns (nil, false) for non-virt-launcher pods and on any lookup error
+// (fail open: the pod is not blocked from eviction).
+func (k *KubevirtMigrationAware) getVMI(pod *v1.Pod) (*unstructured.Unstructured, bool) {
+	vmiName, ok := pod.Annotations[vmiAnnotationKey]
+	if !ok {
+		return nil, false
+	}
+
+	rObj, err := k.vmiLister.ByNamespace(pod.Namespace).Get(vmiName)
+	if err != nil {
+		k.logger.V(4).Info("VMI not found in cache, allowing eviction",
+			"pod", klog.KObj(pod), "vmi", vmiName, "err", err)
+		return nil, false
+	}
+
+	uObj, ok := rObj.(*unstructured.Unstructured)
+	if !ok {
+		k.logger.V(4).Info("Unexpected VMI object type, allowing eviction",
+			"pod", klog.KObj(pod), "vmi", vmiName, "type", fmt.Sprintf("%T", rObj))
+		return nil, false
+	}
+
+	return uObj, true
+}
+
+// migrationInProgress returns true when the VMI has a migration that has
+// started but not yet finished (startTimestamp present, endTimestamp absent).
+func migrationInProgress(uObj *unstructured.Unstructured) bool { + startTS, found, _ := unstructured.NestedString(uObj.Object, "status", "migrationState", "startTimestamp") + if !found || startTS == "" { + return false + } + endTS, found, _ := unstructured.NestedString(uObj.Object, "status", "migrationState", "endTimestamp") + return !found || endTS == "" +} + +// migrationStartTime returns the time at which the last migration started. +// Returns (nil, false) when no startTimestamp is recorded or it is malformed. +func migrationStartTime(uObj *unstructured.Unstructured) (*time.Time, bool) { + startTS, found, _ := unstructured.NestedString(uObj.Object, "status", "migrationState", "startTimestamp") + if !found || startTS == "" { + return nil, false + } + t, err := time.Parse(time.RFC3339, startTS) + if err != nil { + return nil, false + } + return &t, true +} + +// migrationEndTime returns the time at which the last migration completed. +// Returns (nil, false) when there is no completed migration record. +func migrationEndTime(uObj *unstructured.Unstructured) (*time.Time, bool) { + endTS, found, _ := unstructured.NestedString(uObj.Object, "status", "migrationState", "endTimestamp") + if !found || endTS == "" { + return nil, false + } + t, err := time.Parse(time.RFC3339, endTS) + if err != nil { + return nil, false + } + return &t, true +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware_test.go b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware_test.go new file mode 100644 index 0000000000..f07b1f6f36 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/kubevirtmigrationaware_test.go @@ -0,0 +1,592 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubevirtmigrationaware + +import ( + "context" + "testing" + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/cache" +) + +// makeVMI builds an unstructured VMI object with optional migrationState. +// migrationState may be nil (no migration ever ran), a map with only +// "startTimestamp" (in progress), or a map with both timestamps (completed). +// No kubevirt imports are needed: the object is just a plain nested map. +func makeVMI(namespace, name string, migrationState map[string]interface{}) *unstructured.Unstructured { + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "kubevirt.io/v1", + "kind": "VirtualMachineInstance", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + }, + } + if migrationState != nil { + obj.Object["status"] = map[string]interface{}{ + "migrationState": migrationState, + } + } + return obj +} + +// inProgressState returns a migrationState map representing an ongoing migration. +func inProgressState(start time.Time) map[string]interface{} { + return map[string]interface{}{ + "startTimestamp": start.UTC().Format(time.RFC3339), + } +} + +// completedState returns a migrationState map representing a finished migration. 
+func completedState(start, end time.Time) map[string]interface{} {
+	// A completed migration is an in-progress one plus an end timestamp.
+	state := inProgressState(start)
+	state["endTimestamp"] = end.UTC().Format(time.RFC3339)
+	return state
+}
+
+// makeVirtLauncherPod builds a pod on nodeName whose kubevirt.io/domain
+// annotation names vmiName, which is how the plugin ties a virt-launcher pod
+// to its VMI.
+func makeVirtLauncherPod(namespace, name, nodeName, vmiName string) *v1.Pod {
+	launcher := makePlainPod(namespace, name, nodeName)
+	launcher.Annotations = map[string]string{vmiAnnotationKey: vmiName}
+	return launcher
+}
+
+// makePlainPod builds a pod on nodeName without any annotations, standing in
+// for a regular (non-KubeVirt) workload.
+func makePlainPod(namespace, name, nodeName string) *v1.Pod {
+	plain := &v1.Pod{}
+	plain.Name = name
+	plain.Namespace = namespace
+	plain.Spec.NodeName = nodeName
+	return plain
+}
+
+// makeVMILister returns a cache.GenericLister backed by an in-memory,
+// namespace-indexed store that holds the given VMIs. No dynamic client or
+// network access is involved.
+func makeVMILister(vmis ...*unstructured.Unstructured) cache.GenericLister {
+	byNamespace := cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}
+	store := cache.NewIndexer(cache.MetaNamespaceKeyFunc, byNamespace)
+	for i := range vmis {
+		// Best-effort population: Add errors are deliberately discarded.
+		_ = store.Add(vmis[i])
+	}
+	return cache.NewGenericLister(store, vmiGVR.GroupResource())
+}
+
+// newTestPlugin is a convenience wrapper that calls the internal constructor
+// with a fake lister. Pass maxCooldown=0 to disable the adaptive cap.
+func newTestPlugin(t *testing.T, cooldown, maxCooldown time.Duration, vmis ...*unstructured.Unstructured) *KubevirtMigrationAware { + t.Helper() + args := &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: cooldown}, + MaxMigrationCooldown: metav1.Duration{Duration: maxCooldown}, + MigrationHistoryWindow: metav1.Duration{Duration: defaultMigrationHistoryWindow}, + } + pg, err := newPlugin(context.Background(), args, nil, makeVMILister(vmis...), newMigrationHistory()) + if err != nil { + t.Fatalf("newPlugin: %v", err) + } + return pg.(*KubevirtMigrationAware) +} + +// ── Filter ──────────────────────────────────────────────────────────────────── + +func TestFilter(t *testing.T) { + const ns = "default" + now := time.Now() + + cases := []struct { + description string + vmis []*unstructured.Unstructured + pod *v1.Pod + wantAllow bool + }{ + { + description: "non-virt-launcher pod (no annotation) is always allowed", + pod: makePlainPod(ns, "plain-pod", "node-1"), + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI is absent from cache is allowed (fail open)", + pod: makeVirtLauncherPod(ns, "virt-launcher-a", "node-1", "vm-a"), + // no VMIs added to the lister + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI has no migration history is allowed", + vmis: []*unstructured.Unstructured{makeVMI(ns, "vm-b", nil)}, + pod: makeVirtLauncherPod(ns, "virt-launcher-b", "node-1", "vm-b"), + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI has a completed migration is allowed", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-c", completedState(now.Add(-10*time.Minute), now.Add(-5*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-c", "node-2", "vm-c"), + wantAllow: true, + }, + { + description: "virt-launcher pod whose VMI migration is in progress is blocked", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-d", inProgressState(now.Add(-2*time.Minute))), + }, + 
pod: makeVirtLauncherPod(ns, "virt-launcher-d", "node-1", "vm-d"), + wantAllow: false, + }, + { + description: "pod in different namespace from VMI cache entry is allowed (cache miss)", + vmis: []*unstructured.Unstructured{ + makeVMI("other-ns", "vm-e", inProgressState(now.Add(-1*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-e", "node-1", "vm-e"), + wantAllow: true, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + plugin := newTestPlugin(t, 5*time.Minute, 0, tc.vmis...) + got := plugin.Filter(tc.pod) + if got != tc.wantAllow { + t.Errorf("Filter() = %v, want %v", got, tc.wantAllow) + } + }) + } +} + +// ── PreEvictionFilter ───────────────────────────────────────────────────────── + +func TestPreEvictionFilter(t *testing.T) { + const ( + ns = "default" + cooldown = 5 * time.Minute + ) + now := time.Now() + + cases := []struct { + description string + vmis []*unstructured.Unstructured + pod *v1.Pod + wantAllow bool + }{ + { + description: "non-virt-launcher pod is always allowed", + pod: makePlainPod(ns, "plain-pod", "node-1"), + wantAllow: true, + }, + { + description: "VMI absent from cache is allowed (fail open)", + pod: makeVirtLauncherPod(ns, "virt-launcher-a", "node-1", "vm-a"), + wantAllow: true, + }, + { + description: "VMI with no migration history is allowed", + vmis: []*unstructured.Unstructured{makeVMI(ns, "vm-b", nil)}, + pod: makeVirtLauncherPod(ns, "virt-launcher-b", "node-1", "vm-b"), + wantAllow: true, + }, + { + description: "VMI whose migration ended just within the cooldown window is deferred", + vmis: []*unstructured.Unstructured{ + // ended 1 minute ago; cooldown is 5 minutes → still blocked + makeVMI(ns, "vm-c", completedState(now.Add(-10*time.Minute), now.Add(-1*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-c", "node-2", "vm-c"), + wantAllow: false, + }, + { + description: "VMI whose migration ended exactly at the cooldown boundary is allowed", + vmis: 
[]*unstructured.Unstructured{ + // ended 5 minutes + 1 second ago → just past the cooldown + makeVMI(ns, "vm-d", completedState(now.Add(-10*time.Minute), now.Add(-(cooldown+time.Second)))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-d", "node-2", "vm-d"), + wantAllow: true, + }, + { + description: "VMI whose migration ended well before the cooldown window is allowed", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-e", completedState(now.Add(-30*time.Minute), now.Add(-20*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-e", "node-3", "vm-e"), + wantAllow: true, + }, + { + description: "VMI mid-migration has no endTimestamp so is allowed by PreEvictionFilter (Filter handles this)", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-f", inProgressState(now.Add(-2*time.Minute))), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-f", "node-1", "vm-f"), + wantAllow: true, + }, + { + description: "malformed endTimestamp is treated as no timestamp (fail open)", + vmis: []*unstructured.Unstructured{ + makeVMI(ns, "vm-g", map[string]interface{}{ + "startTimestamp": now.Add(-10 * time.Minute).UTC().Format(time.RFC3339), + "endTimestamp": "not-a-valid-timestamp", + }), + }, + pod: makeVirtLauncherPod(ns, "virt-launcher-g", "node-1", "vm-g"), + wantAllow: true, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + plugin := newTestPlugin(t, cooldown, 0, tc.vmis...) + got := plugin.PreEvictionFilter(tc.pod) + if got != tc.wantAllow { + t.Errorf("PreEvictionFilter() = %v, want %v", got, tc.wantAllow) + } + }) + } +} + +// ── Cooldown duration is respected ─────────────────────────────────────────── + +func TestPreEvictionFilterRespectsConfiguredCooldown(t *testing.T) { + const ns = "default" + now := time.Now() + endedAgo := 3 * time.Minute + + // A 1-minute migration ensures the configured cooldown always dominates + // (migration duration < any cooldown under test). 
+ vmi := makeVMI(ns, "vm-1", completedState(now.Add(-(endedAgo+time.Minute)), now.Add(-endedAgo))) + pod := makeVirtLauncherPod(ns, "virt-launcher-1", "node-1", "vm-1") + + // With a 5-minute cooldown the VM (ended 3m ago) should be deferred. + t.Run("5m cooldown blocks VM ended 3m ago", func(t *testing.T) { + plugin := newTestPlugin(t, 5*time.Minute, 0, vmi) + if plugin.PreEvictionFilter(pod) { + t.Error("PreEvictionFilter() = true (allowed), want false (deferred)") + } + }) + + // With a 2-minute cooldown the same VM should be evictable (elapsed 3m > effective 2m). + t.Run("2m cooldown allows VM ended 3m ago", func(t *testing.T) { + plugin := newTestPlugin(t, 2*time.Minute, 0, vmi) + if !plugin.PreEvictionFilter(pod) { + t.Error("PreEvictionFilter() = false (deferred), want true (allowed)") + } + }) + + // With zero cooldown the adaptive cooldown equals the migration duration (1m); + // elapsed 3m > 1m so the VM is immediately evictable. + t.Run("zero cooldown allows VM ended 3m ago (1m migration)", func(t *testing.T) { + plugin := newTestPlugin(t, 0, 0, vmi) + if !plugin.PreEvictionFilter(pod) { + t.Error("PreEvictionFilter() = false (deferred), want true (allowed)") + } + }) +} + +// ── Adaptive per-VM cooldown ────────────────────────────────────────────────── + +func TestPreEvictionFilterAdaptiveCooldown(t *testing.T) { + const ( + ns = "default" + configured = 15 * time.Minute + ) + now := time.Now() + + cases := []struct { + description string + migStart time.Duration // relative to now + migEnd time.Duration // relative to now + maxCooldown time.Duration // 0 = disabled + wantAllow bool + }{ + { + // Small VM: 2-minute migration — configured 15m dominates. + // Ended 16m ago → elapsed(16m) > effective(15m) → allowed. + description: "small VM: configured cooldown dominates, elapsed past it", + migStart: -20 * time.Minute, + migEnd: -18 * time.Minute, // duration = 2m + wantAllow: true, + }, + { + // Small VM: same 2-minute migration, but ended only 10m ago. 
+ // effective = max(15m, 2m) = 15m; elapsed(10m) < 15m → blocked. + description: "small VM: configured cooldown dominates, still within window", + migStart: -25 * time.Minute, + migEnd: -10 * time.Minute, // duration = 15m; elapsed = 10m + wantAllow: false, + }, + { + // Large VM: 30-minute migration — duration dominates over 15m. + // Ended 5m ago → elapsed(5m) < effective(30m) → blocked. + description: "large VM: migration duration dominates, still within window", + migStart: -35 * time.Minute, + migEnd: -5 * time.Minute, // duration = 30m + wantAllow: false, + }, + { + // Large VM: 30-minute migration, ended 31m ago. + // effective = 30m; elapsed(31m) > 30m → allowed. + description: "large VM: migration duration dominates, elapsed past it", + migStart: -61 * time.Minute, + migEnd: -31 * time.Minute, // duration = 30m + wantAllow: true, + }, + { + // Large VM with cap: 30-minute migration capped at 20m. + // effective = min(max(15m, 30m), 20m) = 20m; ended 5m ago → blocked. + description: "large VM: cap applied, still within capped window", + migStart: -35 * time.Minute, + migEnd: -5 * time.Minute, // duration = 30m; cap = 20m + maxCooldown: 20 * time.Minute, + wantAllow: false, + }, + { + // Large VM with cap: same migration, but ended 21m ago. + // effective = 20m (capped); elapsed(21m) > 20m → allowed. + description: "large VM: cap applied, elapsed past capped window", + migStart: -56 * time.Minute, + migEnd: -21 * time.Minute, // duration = 35m; cap = 20m + maxCooldown: 20 * time.Minute, + wantAllow: true, + }, + { + // No startTimestamp: adaptive path skipped, falls back to configured cooldown. + // effective = 15m; ended 10m ago → blocked. 
+ description: "missing startTimestamp: falls back to configured cooldown", + migStart: 0, // sentinel: will be omitted from migrationState + migEnd: -10 * time.Minute, + wantAllow: false, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + var state map[string]interface{} + if tc.migStart == 0 { + // Only endTimestamp, no startTimestamp. + state = map[string]interface{}{ + "endTimestamp": now.Add(tc.migEnd).UTC().Format(time.RFC3339), + } + } else { + state = completedState(now.Add(tc.migStart), now.Add(tc.migEnd)) + } + vmi := makeVMI(ns, "vm-adaptive", state) + pod := makeVirtLauncherPod(ns, "virt-launcher-adaptive", "node-1", "vm-adaptive") + + plugin := newTestPlugin(t, configured, tc.maxCooldown, vmi) + got := plugin.PreEvictionFilter(pod) + if got != tc.wantAllow { + t.Errorf("PreEvictionFilter() = %v, want %v", got, tc.wantAllow) + } + }) + } +} + +// ── Exponential backoff from migration frequency ────────────────────────────── + +func TestPreEvictionFilterExponentialBackoff(t *testing.T) { + const ( + ns = "default" + cooldown = 15 * time.Minute + vmiUID = types.UID("uid-vm-backoff") + ) + now := time.Now() + + // A 10-second migration that ended 5 minutes ago. + // Base effective cooldown = max(15m, 10s) = 15m. elapsed = 5m < 15m. + vmi := makeVMI(ns, "vm-backoff", completedState( + now.Add(-5*time.Minute-10*time.Second), + now.Add(-5*time.Minute), + )) + vmi.SetUID(vmiUID) + pod := makeVirtLauncherPod(ns, "virt-launcher-backoff", "node-1", "vm-backoff") + + cases := []struct { + description string + historyCount int // entries to pre-populate (spread over last few hours) + maxCooldown time.Duration // 0 = disabled + wantAllow bool + }{ + { + // count=0: race — current migration not yet in history; base 15m applies. 
+ description: "count=0 (race): base cooldown, blocked", + historyCount: 0, + wantAllow: false, // 15m * 2^0 = 15m; elapsed 5m < 15m + }, + { + // count=1: one entry (current migration recorded); 2^0 doublings = 15m. + description: "count=1: first migration, base cooldown, blocked", + historyCount: 1, + wantAllow: false, // 15m * 2^0 = 15m; elapsed 5m < 15m + }, + { + // count=2: one prior + current; 2^1 doublings → 30m. + description: "count=2: one prior migration doubles cooldown to 30m, blocked", + historyCount: 2, + wantAllow: false, // 15m * 2^1 = 30m; elapsed 5m < 30m + }, + { + // count=3: two prior; 2^2 doublings → 60m. + description: "count=3: two prior migrations, cooldown 60m, blocked", + historyCount: 3, + wantAllow: false, // 15m * 2^2 = 60m; elapsed 5m < 60m + }, + { + // count=2 with max=20m: min(30m, 20m) = 20m; elapsed 5m < 20m → blocked. + description: "count=2 with max cap at 20m: capped, still blocked", + historyCount: 2, + maxCooldown: 20 * time.Minute, + wantAllow: false, + }, + { + // count=4 with max=20m: min(120m, 20m) = 20m; elapsed 5m < 20m → blocked. + description: "count=4 with max cap at 20m: cap bounds growth, blocked", + historyCount: 4, + maxCooldown: 20 * time.Minute, + wantAllow: false, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + plugin := newTestPlugin(t, cooldown, tc.maxCooldown, vmi) + // Pre-populate history: spread entries so they all fall within the 6h window. 
+ for i := 0; i < tc.historyCount; i++ { + plugin.history.record(vmiUID, now.Add(-time.Duration(i+1)*30*time.Minute)) + } + got := plugin.PreEvictionFilter(pod) + if got != tc.wantAllow { + t.Errorf("PreEvictionFilter() = %v, want %v", got, tc.wantAllow) + } + }) + } +} + +// ── migrationHistory unit tests ─────────────────────────────────────────────── + +func TestMigrationHistory(t *testing.T) { + const ( + uid = types.UID("uid-test") + window = 24 * time.Hour + ) + now := time.Now() + + t.Run("empty history returns zero", func(t *testing.T) { + h := newMigrationHistory() + if got := h.countAndPrune(uid, window); got != 0 { + t.Errorf("countAndPrune() = %d, want 0", got) + } + }) + + t.Run("entries within window are counted", func(t *testing.T) { + h := newMigrationHistory() + h.record(uid, now.Add(-1*time.Hour)) + h.record(uid, now.Add(-2*time.Hour)) + if got := h.countAndPrune(uid, window); got != 2 { + t.Errorf("countAndPrune() = %d, want 2", got) + } + }) + + t.Run("entries outside the window are pruned", func(t *testing.T) { + h := newMigrationHistory() + h.record(uid, now.Add(-25*time.Hour)) // outside 24h window + h.record(uid, now.Add(-1*time.Hour)) // inside + if got := h.countAndPrune(uid, window); got != 1 { + t.Errorf("countAndPrune() = %d, want 1 (stale entry pruned)", got) + } + }) + + t.Run("all entries expired: map entry is deleted", func(t *testing.T) { + h := newMigrationHistory() + h.record(uid, now.Add(-25*time.Hour)) + if got := h.countAndPrune(uid, window); got != 0 { + t.Errorf("countAndPrune() = %d, want 0", got) + } + h.mu.Lock() + _, exists := h.completions[uid] + h.mu.Unlock() + if exists { + t.Error("map entry was not deleted after all entries expired") + } + }) +} + +func TestMigrationHistoryOnVMIUpdate(t *testing.T) { + const ( + ns = "default" + uid = types.UID("uid-update-test") + ) + now := time.Now() + + withUID := func(vmi *unstructured.Unstructured) *unstructured.Unstructured { + vmi.SetUID(uid) + return vmi + } + + 
t.Run("migration completion is recorded", func(t *testing.T) { + h := newMigrationHistory() + old := withUID(makeVMI(ns, "vmi", inProgressState(now.Add(-10*time.Minute)))) + new := withUID(makeVMI(ns, "vmi", completedState(now.Add(-10*time.Minute), now.Add(-1*time.Minute)))) + h.onVMIUpdate(old, new) + if got := h.countAndPrune(uid, 24*time.Hour); got != 1 { + t.Errorf("countAndPrune() = %d, want 1", got) + } + }) + + t.Run("update with unchanged endTimestamp is not re-recorded", func(t *testing.T) { + h := newMigrationHistory() + vmi := withUID(makeVMI(ns, "vmi", completedState(now.Add(-10*time.Minute), now.Add(-1*time.Minute)))) + h.onVMIUpdate(vmi, vmi) // same object, endTimestamp unchanged + if got := h.countAndPrune(uid, 24*time.Hour); got != 0 { + t.Errorf("countAndPrune() = %d, want 0 (no transition)", got) + } + }) + + t.Run("update with no migration state is ignored", func(t *testing.T) { + h := newMigrationHistory() + old := withUID(makeVMI(ns, "vmi", nil)) + new := withUID(makeVMI(ns, "vmi", nil)) + h.onVMIUpdate(old, new) + if got := h.countAndPrune(uid, 24*time.Hour); got != 0 { + t.Errorf("countAndPrune() = %d, want 0", got) + } + }) + + t.Run("second migration completion increments count", func(t *testing.T) { + h := newMigrationHistory() + first := withUID(makeVMI(ns, "vmi", completedState(now.Add(-3*time.Hour), now.Add(-2*time.Hour)))) + second := withUID(makeVMI(ns, "vmi", completedState(now.Add(-30*time.Minute), now.Add(-5*time.Minute)))) + h.onVMIUpdate(first, second) + if got := h.countAndPrune(uid, 24*time.Hour); got != 1 { + t.Errorf("countAndPrune() = %d, want 1", got) + } + }) +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/register.go b/pkg/framework/plugins/kubevirtmigrationaware/register.go new file mode 100644 index 0000000000..0db38920e2 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/register.go @@ -0,0 +1,31 @@ +/* +Copyright 2026 The Kubernetes Authors. 
var (
	// SchemeBuilder is this package's scheme builder, created empty via
	// runtime.NewSchemeBuilder.
	SchemeBuilder = runtime.NewSchemeBuilder()
	// localSchemeBuilder is a pointer to SchemeBuilder; init registers
	// functions through it.
	localSchemeBuilder = &SchemeBuilder
	// AddToScheme is the AddToScheme method value of localSchemeBuilder.
	AddToScheme = localSchemeBuilder.AddToScheme
)

// init registers addDefaultingFuncs (declared elsewhere in this package) with
// the package's scheme builder.
func init() {
	localSchemeBuilder.Register(addDefaultingFuncs)
}
type KubevirtMigrationAwareArgs struct {
	metav1.TypeMeta `json:",inline"`

	// MigrationCooldown is the minimum duration that must elapse after a VM
	// live-migration completes before the descheduler may evict the virt-launcher
	// pod again. The effective per-VM cooldown is max(MigrationCooldown,
	// migration duration), so heavier VMs automatically receive longer protection.
	// Defaults to 15m.
	//
	// ValidateKubevirtMigrationAwareArgs rejects negative values.
	MigrationCooldown metav1.Duration `json:"migrationCooldown,omitempty"`

	// MaxMigrationCooldown caps the adaptive per-VM cooldown computed as
	// max(MigrationCooldown, migration duration) after exponential backoff is
	// applied. Use this to prevent pathological cases (very slow migrations or
	// heavy churn) from locking a VM indefinitely. Defaults to 6h.
	//
	// ValidateKubevirtMigrationAwareArgs rejects negative values and, when the
	// value is greater than zero, requires it to be >= MigrationCooldown.
	MaxMigrationCooldown metav1.Duration `json:"maxMigrationCooldown,omitempty"`

	// MigrationHistoryWindow is the sliding window over which past migration
	// completions are counted for exponential-backoff purposes. Longer windows
	// make the plugin sensitive to day-scale churn; shorter windows let VMs
	// recover their clean record faster. Defaults to 24h.
	//
	// ValidateKubevirtMigrationAwareArgs rejects negative values.
	MigrationHistoryWindow metav1.Duration `json:"migrationHistoryWindow,omitempty"`
}
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubevirtmigrationaware + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/runtime" +) + +func ValidateKubevirtMigrationAwareArgs(obj runtime.Object) error { + args := obj.(*KubevirtMigrationAwareArgs) + if args.MigrationCooldown.Duration < 0 { + return fmt.Errorf("migrationCooldown must be non-negative, got %v", args.MigrationCooldown.Duration) + } + if args.MaxMigrationCooldown.Duration < 0 { + return fmt.Errorf("maxMigrationCooldown must be non-negative, got %v", args.MaxMigrationCooldown.Duration) + } + if args.MaxMigrationCooldown.Duration > 0 && args.MaxMigrationCooldown.Duration < args.MigrationCooldown.Duration { + return fmt.Errorf("maxMigrationCooldown (%v) must be >= migrationCooldown (%v)", + args.MaxMigrationCooldown.Duration, args.MigrationCooldown.Duration) + } + if args.MigrationHistoryWindow.Duration < 0 { + return fmt.Errorf("migrationHistoryWindow must be non-negative, got %v", args.MigrationHistoryWindow.Duration) + } + return nil +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/validation_test.go b/pkg/framework/plugins/kubevirtmigrationaware/validation_test.go new file mode 100644 index 0000000000..65a14ef0f9 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/validation_test.go @@ -0,0 +1,114 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubevirtmigrationaware + +import ( + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestValidateArgs(t *testing.T) { + cases := []struct { + description string + args *KubevirtMigrationAwareArgs + wantErr bool + }{ + { + description: "zero cooldown is valid (disables the cooldown gate)", + args: &KubevirtMigrationAwareArgs{}, + wantErr: false, + }, + { + description: "positive cooldown is valid", + args: &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: 5 * time.Minute}, + }, + wantErr: false, + }, + { + description: "negative cooldown is invalid", + args: &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: -1 * time.Second}, + }, + wantErr: true, + }, + { + description: "negative maxMigrationCooldown is invalid", + args: &KubevirtMigrationAwareArgs{ + MaxMigrationCooldown: metav1.Duration{Duration: -1 * time.Second}, + }, + wantErr: true, + }, + { + description: "maxMigrationCooldown below migrationCooldown is invalid", + args: &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: 15 * time.Minute}, + MaxMigrationCooldown: metav1.Duration{Duration: 10 * time.Minute}, + }, + wantErr: true, + }, + { + description: "maxMigrationCooldown equal to migrationCooldown is valid", + args: &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: 15 * time.Minute}, + MaxMigrationCooldown: metav1.Duration{Duration: 15 * time.Minute}, + }, + wantErr: false, + }, + { + description: "maxMigrationCooldown greater than migrationCooldown is valid", + args: &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: 15 * time.Minute}, + MaxMigrationCooldown: metav1.Duration{Duration: 1 * time.Hour}, + }, + wantErr: false, + }, + { + description: "zero maxMigrationCooldown (disabled) is always valid", + args: &KubevirtMigrationAwareArgs{ + MigrationCooldown: metav1.Duration{Duration: 15 * time.Minute}, + MaxMigrationCooldown: 
metav1.Duration{Duration: 0}, + }, + wantErr: false, + }, + { + description: "positive migrationHistoryWindow is valid", + args: &KubevirtMigrationAwareArgs{ + MigrationHistoryWindow: metav1.Duration{Duration: 24 * time.Hour}, + }, + wantErr: false, + }, + { + description: "negative migrationHistoryWindow is invalid", + args: &KubevirtMigrationAwareArgs{ + MigrationHistoryWindow: metav1.Duration{Duration: -1 * time.Hour}, + }, + wantErr: true, + }, + } + + for _, tc := range cases { + t.Run(tc.description, func(t *testing.T) { + err := ValidateKubevirtMigrationAwareArgs(tc.args) + if (err != nil) != tc.wantErr { + t.Errorf("ValidateKubevirtMigrationAwareArgs() error = %v, wantErr = %v", err, tc.wantErr) + } + }) + } +} diff --git a/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.deepcopy.go b/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.deepcopy.go new file mode 100644 index 0000000000..98893d0563 --- /dev/null +++ b/pkg/framework/plugins/kubevirtmigrationaware/zz_generated.deepcopy.go @@ -0,0 +1,54 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by deepcopy-gen. DO NOT EDIT. + +package kubevirtmigrationaware + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
// NOTE: this file is marked "Code generated by deepcopy-gen. DO NOT EDIT.";
// change types.go and regenerate rather than editing these functions by hand.
func (in *KubevirtMigrationAwareArgs) DeepCopyInto(out *KubevirtMigrationAwareArgs) {
	// The struct assignment copies every field; the per-field assignments
	// that follow repeat that copy for each member.
	*out = *in
	out.TypeMeta = in.TypeMeta
	out.MigrationCooldown = in.MigrationCooldown
	out.MaxMigrationCooldown = in.MaxMigrationCooldown
	out.MigrationHistoryWindow = in.MigrationHistoryWindow
	return
}

// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubevirtMigrationAwareArgs.
// A nil receiver yields nil.
func (in *KubevirtMigrationAwareArgs) DeepCopy() *KubevirtMigrationAwareArgs {
	if in == nil {
		return nil
	}
	out := new(KubevirtMigrationAwareArgs)
	in.DeepCopyInto(out)
	return out
}

// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
// It returns an untyped nil when DeepCopy returns nil, so callers never
// receive a non-nil interface wrapping a nil pointer.
func (in *KubevirtMigrationAwareArgs) DeepCopyObject() runtime.Object {
	if c := in.DeepCopy(); c != nil {
		return c
	}
	return nil
}
// RegisterDefaults adds defaulters functions to the given scheme.
// Public to allow building arbitrary schemes.
// All generated defaulters are covering - they call all nested defaulters.
//
// As generated here it registers nothing: scheme is not used and the
// function always returns nil.
func RegisterDefaults(scheme *runtime.Scheme) error {
	return nil
}