tmp: Don't include buffer in Reserve()
sharnoff committed May 21, 2024
1 parent 6555a77 commit c8e250e
Showing 2 changed files with 30 additions and 8 deletions.
pkg/plugin/plugin.go (9 changes: 8 additions & 1 deletion)
@@ -817,7 +817,14 @@ func (e *AutoscaleEnforcer) Reserve(
         return status
     }

-    ok, verdict, err := e.reserveResources(ctx, logger, pod, "Reserve", false)
+    ok, verdict, err := e.reserveResources(ctx, logger, pod, "Reserve", reserveOptions{
+        // we *could* deny, but that's ultimately less reliable.
+        // For more, see https://github.com/neondatabase/autoscaling/issues/869
+        allowDeny: false,
+        // don't include buffer because we know that future changes by the autoscaler-agent must go
+        // through us.
+        includeBuffer: false,
+    })
     if err != nil {
         return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
     }
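For orientation, a minimal sketch of the pattern this commit moves to: replacing the positional bool with a small options struct, so each call site spells out its choices. The types and call sites below are simplified, hypothetical stand-ins, not the plugin's real definitions.

package main

import "fmt"

// reserveOptions mirrors the shape introduced in this commit: an options struct
// in place of a positional bool, so each call site names its choices.
// (Simplified stand-in; not the plugin's actual definition.)
type reserveOptions struct {
    allowDeny     bool
    includeBuffer bool
}

// reserveResources is a stand-in for the real method; it only reports what it was asked to do.
func reserveResources(action string, opts reserveOptions) {
    fmt.Printf("%s: allowDeny=%v includeBuffer=%v\n", action, opts.allowDeny, opts.includeBuffer)
}

func main() {
    // Reserve(): denying is ultimately less reliable, and future changes by the
    // autoscaler-agent must go through the scheduler anyway, so both are false.
    reserveResources("Reserve", reserveOptions{allowDeny: false, includeBuffer: false})

    // Pod-started handling: an already-running pod can't be denied; include buffer
    // only if the pod is supposed to be handled by this scheduler.
    sameScheduler := true // stand-in for pod.Spec.SchedulerName == e.state.conf.SchedulerName
    reserveResources("Pod started", reserveOptions{allowDeny: false, includeBuffer: sameScheduler})
}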
pkg/plugin/state.go (29 changes: 22 additions & 7 deletions)
@@ -585,7 +585,18 @@ func (e *AutoscaleEnforcer) handleStarted(logger *zap.Logger, pod *corev1.Pod) {

     logger.Info("Handling Pod start event")

-    _, _, _ = e.reserveResources(context.TODO(), logger, pod, "Pod started", false)
+    _, _, _ = e.reserveResources(context.TODO(), logger, pod, "Pod started", reserveOptions{
+        // pod already started, out of our control - we don't have a mechanism to deny it
+        allowDeny: false,
+        // this may be a preexisting VM. If so, we should include it in "buffer" as long as it's
+        // supposed to be handled by us (otherwise, the "buffer" will never be resolved)
+        includeBuffer: pod.Spec.SchedulerName == e.state.conf.SchedulerName,
+    })
 }
+
+type reserveOptions struct {
+    allowDeny     bool
+    includeBuffer bool
+}

 // reserveResources attempts to set aside resources on the node for the pod.
@@ -601,7 +612,7 @@ func (e *AutoscaleEnforcer) reserveResources(
     logger *zap.Logger,
     pod *corev1.Pod,
     action string,
-    allowDeny bool,
+    opts reserveOptions,
 ) (ok bool, _ *verdictSet, _ error) {
     nodeName := pod.Spec.NodeName

@@ -646,7 +657,7 @@ func (e *AutoscaleEnforcer) reserveResources(
         e.metrics.IncReserveShouldDeny(pod, node)
     }

-    if shouldDeny && allowDeny {
+    if shouldDeny && opts.allowDeny {
         cpuShortVerdict := "NOT ENOUGH"
         if add.VCPU <= node.remainingReservableCPU() {
             cpuShortVerdict = "OK"
@@ -703,10 +714,14 @@ func (e *AutoscaleEnforcer) reserveResources(
         Max: vmInfo.Max().Mem,
     }

-    // If scaling isn't enabled *or* the pod is involved in an ongoing migration, then we can be
-    // more precise about usage (because scaling is forbidden while migrating).
+    // If scaling isn't enabled *or* the pod is involved in an ongoing migration *or* the caller
+    // has opted out of setting Buffer, then we can be more precise about usage.
+    //
+    // Buffer exists to handle scaling that may happen due to a prior scheduler's approval.
+    // If scaling is disabled, we don't have to worry about this, and if there's an ongoing
+    // migration, scaling is forbidden.
     migrating := util.TryPodOwnerVirtualMachineMigration(pod) != nil
-    if !vmInfo.Config.ScalingEnabled || migrating {
+    if !vmInfo.Config.ScalingEnabled || migrating || !opts.includeBuffer {
         cpuState.Buffer = 0
         cpuState.Reserved = vmInfo.Using().VCPU
         memState.Buffer = 0
@@ -769,7 +784,7 @@ func (e *AutoscaleEnforcer) reserveResources(
         }
     }

-    if allowDeny {
+    if opts.allowDeny {
         logger.Info("Allowing reserve resources for Pod", zap.Object("verdict", verdict))
     } else if shouldDeny /* but couldn't */ {
         logger.Warn("Reserved resources for Pod above totals", zap.Object("verdict", verdict))
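As a rough illustration of the branch changed above: when scaling is disabled, the VM is migrating, or the caller passed includeBuffer: false, the reservation tracks current usage exactly and Buffer stays zero; otherwise some headroom is kept as Buffer until the autoscaler-agent's next contact resolves it. The sketch below uses simplified, hypothetical types, and its headroom argument is only a placeholder for whatever amount the plugin would actually set aside.

package main

import "fmt"

// resourceState is a simplified stand-in for the plugin's per-pod CPU/memory state.
type resourceState struct {
    Reserved uint
    Buffer   uint
}

// reserve mirrors the condition above: be precise (no buffer) when scaling is
// disabled, the VM is migrating, or the caller opted out via includeBuffer.
// headroom only stands in for whatever extra the plugin would otherwise reserve
// for scaling that a prior scheduler may already have approved.
func reserve(using, headroom uint, scalingEnabled, migrating, includeBuffer bool) resourceState {
    if !scalingEnabled || migrating || !includeBuffer {
        return resourceState{Reserved: using, Buffer: 0}
    }
    return resourceState{Reserved: using + headroom, Buffer: headroom}
}

func main() {
    fmt.Println(reserve(2, 2, true, false, true))  // {4 2}: buffer kept for possible prior approval
    fmt.Println(reserve(2, 2, true, false, false)) // {2 0}: caller opted out, track usage exactly
    fmt.Println(reserve(2, 2, false, false, true)) // {2 0}: scaling disabled, be precise
}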
