pkg/operator/defragcontroller: ensure defrag has clear signal
Signed-off-by: Sam Batschelet <sbatsche@redhat.com>
hexfusion committed Oct 13, 2021
1 parent 10c3297 commit aa7ff87
Showing 1 changed file with 21 additions and 8 deletions.
pkg/operator/defragcontroller/defragcontroller.go (21 additions, 8 deletions)
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"math"
 	"time"

 	configv1 "github.com/openshift/api/config/v1"
@@ -23,12 +24,17 @@ import (

 const (
 	minDefragBytes          int64   = 100 * 1024 * 1024 // 100MB
+	minDefragWaitDuration           = 36 * time.Second
 	maxFragmentedPercentage float64 = 45
-	waitDuration                    = 2 * time.Second
-	timeoutDuration                 = 30 * time.Second
+	pollWaitDuration                = 2 * time.Second
+	pollTimeoutDuration             = 45 * time.Second
+	compactionInterval              = 10 * time.Minute

 	defragDisabledCondition = "DefragControllerDisabled"
 )

-// DefragController observes the operand state file for fragmentation
+// DefragController observes the etcd state file fragmentation via Status method of Maintenance API. Based on these
+// observations the controller will perform rolling defragmentation of each etcd member in the cluster.
 type DefragController struct {
 	operatorClient       v1helpers.OperatorClient
 	etcdClient           etcdcli.EtcdClient
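The rewritten doc comment names the data source: the etcd Maintenance API's Status call, which reports both the on-disk size of a member's backend and the portion still logically in use. A minimal standalone sketch of that call, assuming a hypothetical plaintext endpoint (the operator actually goes through its etcdcli wrapper and the cluster's TLS config):

package main

import (
	"context"
	"fmt"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	endpoint := "http://127.0.0.1:2379" // hypothetical; real members use TLS

	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{endpoint},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	// Maintenance.Status is promoted on the client; DbSize is the backend
	// file size on disk, DbSizeInUse the part still logically in use.
	status, err := cli.Status(context.Background(), endpoint)
	if err != nil {
		panic(err)
	}
	fmt.Printf("dbSize=%d dbSizeInUse=%d\n", status.DbSize, status.DbSizeInUse)
}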
@@ -46,7 +52,7 @@ func NewDefragController(
 		etcdClient:           etcdClient,
 		infrastructureLister: infrastructureLister,
 	}
-	return factory.New().ResyncEvery(9*time.Minute).WithInformers(
+	return factory.New().ResyncEvery(compactionInterval+1*time.Minute).WithInformers( // attempt to sync outside of etcd compaction interval to ensure maximum gain by defragmentation.
 		operatorClient.Informer(),
 	).WithSync(c.sync).ToController("DefragController", eventRecorder.WithComponentSuffix("defrag-controller"))
 }
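A note on the resync choice: with compactionInterval at 10 minutes, ResyncEvery(compactionInterval+1*time.Minute) fires every 11 minutes. Defragmentation can only reclaim space that compaction has already freed, so per the inline comment, syncing just outside the compaction interval maximizes the gain; the extra minute also keeps syncs from running in lockstep with the compaction cycle (an inference from the comment, not spelled out elsewhere in the diff).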
@@ -164,9 +170,13 @@ func (c *DefragController) checkDefrag(ctx context.Context, recorder events.Recorder)

 	// Give cluster time to recover before we move to the next member.
 	if err := wait.Poll(
-		waitDuration,
-		timeoutDuration,
+		pollWaitDuration,
+		pollTimeoutDuration,
 		func() (bool, error) {
+			// Ensure defragmentation attempts have clear observable signal.
+			klog.V(4).Infof("Sleeping to allow cluster to recover before defrag next member %v", minDefragWaitDuration)
+			time.Sleep(minDefragWaitDuration)
+
 			memberHealth, err := c.etcdClient.MemberHealth(ctx)
 			if err != nil {
 				klog.Warningf("failed checking member health: %v", err)
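Moving the sleep inside the poll condition changes the timing: wait.Poll always waits one interval before the first condition call, so the first health check now lands at roughly 2s + 36s = 38s, inside the 45-second budget, and an unhealthy first check effectively exhausts it. A rough, self-contained illustration of that shape (the always-true condition stands in for the real member-health check):

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	start := time.Now()
	// Same shape as the controller's loop: 2s interval, 45s budget,
	// 36s quiet period inside the condition. One sleep-plus-check fits
	// the budget (~38s); a failed first check would exhaust it.
	err := wait.Poll(2*time.Second, 45*time.Second, func() (bool, error) {
		time.Sleep(36 * time.Second)
		fmt.Printf("checking member health at t=%v\n", time.Since(start).Round(time.Second))
		return true, nil // stand-in for "all members healthy"
	})
	fmt.Println("poll finished:", err)
}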
@@ -195,13 +205,16 @@ func isEndpointBackendFragmented(member *etcdserverpb.Member, endpointStatus *clientv3.StatusResponse)
 		return false
 	}
 	fragmentedPercentage := checkFragmentationPercentage(endpointStatus.DbSize, endpointStatus.DbSizeInUse)
-	klog.Infof("etcd member %q backend store fragmented: %.2f %%, dbSize: %d", member.Name, fragmentedPercentage, endpointStatus.DbSize)
+	if fragmentedPercentage > 0.00 {
+		klog.Infof("etcd member %q backend store fragmented: %.2f %%, dbSize: %d", member.Name, fragmentedPercentage, endpointStatus.DbSize)
+	}
 	return fragmentedPercentage >= maxFragmentedPercentage && endpointStatus.DbSize >= minDefragBytes
 }

 func checkFragmentationPercentage(ondisk, inuse int64) float64 {
 	diff := float64(ondisk - inuse)
-	return (diff / float64(ondisk)) * 100
+	fragmentedPercentage := (diff / float64(ondisk)) * 100
+	return math.Round(fragmentedPercentage*100) / 100
 }

 func getMemberFromStatus(members []*etcdserverpb.Member, endpointStatus *clientv3.StatusResponse) (*etcdserverpb.Member, error) {
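To make the thresholds concrete, here is the fragmentation math worked through with checkFragmentationPercentage copied verbatim from the patch (the sample sizes are invented): a member whose backend file is 200MB on disk with only 100MB in use is 50.00% fragmented, which clears both the 45% threshold and the 100MB size floor, so it would be defragmented.

package main

import (
	"fmt"
	"math"
)

const (
	minDefragBytes          int64   = 100 * 1024 * 1024 // 100MB
	maxFragmentedPercentage float64 = 45
)

// Copied from the patch: fragmentation as the share of the on-disk file
// no longer in use, rounded to two decimal places.
func checkFragmentationPercentage(ondisk, inuse int64) float64 {
	diff := float64(ondisk - inuse)
	fragmentedPercentage := (diff / float64(ondisk)) * 100
	return math.Round(fragmentedPercentage*100) / 100
}

func main() {
	dbSize := int64(200 * 1024 * 1024)      // 200MB on disk
	dbSizeInUse := int64(100 * 1024 * 1024) // 100MB logically in use

	p := checkFragmentationPercentage(dbSize, dbSizeInUse)
	// 50.00% >= 45% and 200MB >= 100MB, so this member would be defragmented.
	fmt.Printf("fragmented: %.2f%%, defrag: %v\n", p, p >= maxFragmentedPercentage && dbSize >= minDefragBytes)
}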
