pkg/operator/defragcontroller: ensure defrag has clear signal
Signed-off-by: Sam Batschelet <sbatsche@redhat.com>
hexfusion committed Oct 13, 2021
1 parent 10c3297 commit aa7ff87
Showing 1 changed file with 21 additions and 8 deletions.
pkg/operator/defragcontroller/defragcontroller.go (21 additions, 8 deletions)
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"math"
 	"time"

 	configv1 "github.com/openshift/api/config/v1"
@@ -23,12 +24,17 @@ import (

 const (
 	minDefragBytes          int64   = 100 * 1024 * 1024 // 100MB
+	minDefragWaitDuration           = 36 * time.Second
 	maxFragmentedPercentage float64 = 45
-	waitDuration                    = 2 * time.Second
-	timeoutDuration                 = 30 * time.Second
+	pollWaitDuration                = 2 * time.Second
+	pollTimeoutDuration             = 45 * time.Second
+	compactionInterval              = 10 * time.Minute

 	defragDisabledCondition = "DefragControllerDisabled"
 )

-// DefragController observes the operand state file for fragmentation
+// DefragController observes the etcd state file fragmentation via Status method of Maintenance API. Based on these
+// observations the controller will perform rolling defragmentation of each etcd member in the cluster.
 type DefragController struct {
 	operatorClient       v1helpers.OperatorClient
 	etcdClient           etcdcli.EtcdClient
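The rewritten doc comment names the data source: the etcd Maintenance API's Status call, which reports both the on-disk size of a member's backend and the portion still logically in use. A minimal standalone sketch of that call, assuming a hypothetical plaintext endpoint (the operator actually goes through its etcdcli wrapper and the cluster's TLS config):

package main

import (
	"context"
	"fmt"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	endpoint := "http://127.0.0.1:2379" // hypothetical; real members use TLS

	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{endpoint},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	// Maintenance.Status is promoted on the client; DbSize is the backend
	// file size on disk, DbSizeInUse the part still logically in use.
	status, err := cli.Status(context.Background(), endpoint)
	if err != nil {
		panic(err)
	}
	fmt.Printf("dbSize=%d dbSizeInUse=%d\n", status.DbSize, status.DbSizeInUse)
}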
@@ -46,7 +52,7 @@ func NewDefragController(
 		etcdClient:           etcdClient,
 		infrastructureLister: infrastructureLister,
 	}
-	return factory.New().ResyncEvery(9*time.Minute).WithInformers(
+	return factory.New().ResyncEvery(compactionInterval+1*time.Minute).WithInformers( // attempt to sync outside of etcd compaction interval to ensure maximum gain by defragmentation.
 		operatorClient.Informer(),
 	).WithSync(c.sync).ToController("DefragController", eventRecorder.WithComponentSuffix("defrag-controller"))
 }
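A note on the resync choice: with compactionInterval at 10 minutes, ResyncEvery(compactionInterval+1*time.Minute) fires every 11 minutes. Defragmentation can only reclaim space that compaction has already freed, so per the inline comment, syncing just outside the compaction interval maximizes the gain; the extra minute also keeps syncs from running in lockstep with the compaction cycle (an inference from the comment, not spelled out elsewhere in the diff).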
@@ -164,9 +170,13 @@ func (c *DefragController) checkDefrag(ctx context.Context, recorder events.Recorder)

 	// Give cluster time to recover before we move to the next member.
 	if err := wait.Poll(
-		waitDuration,
-		timeoutDuration,
+		pollWaitDuration,
+		pollTimeoutDuration,
 		func() (bool, error) {
+			// Ensure defragmentation attempts have clear observable signal.
+			klog.V(4).Infof("Sleeping to allow cluster to recover before defrag next member %v", minDefragWaitDuration)
+			time.Sleep(minDefragWaitDuration)
+
 			memberHealth, err := c.etcdClient.MemberHealth(ctx)
 			if err != nil {
 				klog.Warningf("failed checking member health: %v", err)
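Moving the sleep inside the poll condition changes the timing: wait.Poll always waits one interval before the first condition call, so the first health check now lands at roughly 2s + 36s = 38s, inside the 45-second budget, and an unhealthy first check effectively exhausts it. A rough, self-contained illustration of that shape (the always-true condition stands in for the real member-health check):

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	start := time.Now()
	// Same shape as the controller's loop: 2s interval, 45s budget,
	// 36s quiet period inside the condition. One sleep-plus-check fits
	// the budget (~38s); a failed first check would exhaust it.
	err := wait.Poll(2*time.Second, 45*time.Second, func() (bool, error) {
		time.Sleep(36 * time.Second)
		fmt.Printf("checking member health at t=%v\n", time.Since(start).Round(time.Second))
		return true, nil // stand-in for "all members healthy"
	})
	fmt.Println("poll finished:", err)
}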
@@ -195,13 +205,16 @@ func isEndpointBackendFragmented(member *etcdserverpb.Member, endpointStatus *clientv3.StatusResponse)
 		return false
 	}
 	fragmentedPercentage := checkFragmentationPercentage(endpointStatus.DbSize, endpointStatus.DbSizeInUse)
-	klog.Infof("etcd member %q backend store fragmented: %.2f %%, dbSize: %d", member.Name, fragmentedPercentage, endpointStatus.DbSize)
+	if fragmentedPercentage > 0.00 {
+		klog.Infof("etcd member %q backend store fragmented: %.2f %%, dbSize: %d", member.Name, fragmentedPercentage, endpointStatus.DbSize)
+	}
 	return fragmentedPercentage >= maxFragmentedPercentage && endpointStatus.DbSize >= minDefragBytes
 }

 func checkFragmentationPercentage(ondisk, inuse int64) float64 {
 	diff := float64(ondisk - inuse)
-	return (diff / float64(ondisk)) * 100
+	fragmentedPercentage := (diff / float64(ondisk)) * 100
+	return math.Round(fragmentedPercentage*100) / 100
 }

 func getMemberFromStatus(members []*etcdserverpb.Member, endpointStatus *clientv3.StatusResponse) (*etcdserverpb.Member, error) {
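To make the thresholds concrete, here is the fragmentation math worked through with checkFragmentationPercentage copied verbatim from the patch (the sample sizes are invented): a member whose backend file is 200MB on disk with only 100MB in use is 50.00% fragmented, which clears both the 45% threshold and the 100MB size floor, so it would be defragmented.

package main

import (
	"fmt"
	"math"
)

const (
	minDefragBytes          int64   = 100 * 1024 * 1024 // 100MB
	maxFragmentedPercentage float64 = 45
)

// Copied from the patch: fragmentation as the share of the on-disk file
// no longer in use, rounded to two decimal places.
func checkFragmentationPercentage(ondisk, inuse int64) float64 {
	diff := float64(ondisk - inuse)
	fragmentedPercentage := (diff / float64(ondisk)) * 100
	return math.Round(fragmentedPercentage*100) / 100
}

func main() {
	dbSize := int64(200 * 1024 * 1024)      // 200MB on disk
	dbSizeInUse := int64(100 * 1024 * 1024) // 100MB logically in use

	p := checkFragmentationPercentage(dbSize, dbSizeInUse)
	// 50.00% >= 45% and 200MB >= 100MB, so this member would be defragmented.
	fmt.Printf("fragmented: %.2f%%, defrag: %v\n", p, p >= maxFragmentedPercentage && dbSize >= minDefragBytes)
}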
