feat: added leader election config for non-ha sno clusters
added functions to help default leader election config for sno clusters
updated controller builder to use sno leader election when the user provides no options and we are currently running in sno control plane topology

Signed-off-by: ehila <ehila@redhat.com>

feat: added cluster infra status check for leader election

added check for sno topology for leader election config

Signed-off-by: ehila <ehila@redhat.com>

upkeep: updated warning message for clarity

Signed-off-by: ehila <ehila@redhat.com>

docs: added calculations and context

added calculations for the sno leader election numbers
added more comments around method to provide context and reasoning

Signed-off-by: ehila <ehila@redhat.com>

feat: added LE disable check and tests

added check for LE disable flag and SingleReplica before calling SNO leader election method
added tests for LeaderElectionSNOConfig and LeaderElectionDefaulting for expected behavior

Signed-off-by: ehila <ehila@redhat.com>

test: fixed test case for defaults

Signed-off-by: ehila <ehila@redhat.com>

test: changed unit test to use reflect package

changed unit test for equality check to use reflect.DeepEqual instead of assert.Equal to follow the convention of other unit tests

Signed-off-by: ehila <ehila@redhat.com>

test: fix unit test when running in cluster

fixed unit test that reads in namespace from cluster as fallback

Signed-off-by: ehila <ehila@redhat.com>

fix: added nil check for infraStatus and error out if not populated

Signed-off-by: ehila <ehila@redhat.com>

fix: added flag to derive user intent on leader election

added a flag to signal if the user supplied any leader election config
added logic to allow us to override the leader election if the user has provided no configs
added unit tests for sno config logic method

Signed-off-by: ehila <ehila@redhat.com>

feat: removed unneeded method

removed GetClusterInfraStatusOrDie method, not needed

Signed-off-by: ehila <ehila@redhat.com>

refactor: changed variable name and logging message

updated flag variable name to be more explicit
inverted logic to match variable name
updated messaging to match for event recorder and logging statement

Signed-off-by: ehila <ehila@redhat.com>

feat: changed flag logic to match recommendation

Signed-off-by: ehila <ehila@redhat.com>
eggfoobar committed Dec 8, 2021
1 parent 8b5f0ba commit 2612981
Showing 5 changed files with 312 additions and 8 deletions.
12 changes: 4 additions & 8 deletions pkg/config/clusterstatus/clusterstatus.go
@@ -2,6 +2,7 @@ package clusterstatus

import (
"context"
"fmt"

configv1 "github.com/openshift/api/config/v1"
openshiftcorev1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
@@ -21,13 +22,8 @@ func GetClusterInfraStatus(ctx context.Context, restClient *rest.Config) (*confi
if err != nil {
return nil, err
}
return &infra.Status, nil
}

func GetClusterInfraStatusOrDie(ctx context.Context, restClient *rest.Config) *configv1.InfrastructureStatus {
infra, err := GetClusterInfraStatus(ctx, restClient)
if err != nil {
panic(err)
if infra == nil {
return nil, fmt.Errorf("getting resource Infrastructure (name: %s) succeeded but object was nil", infraResourceName)
}
return infra
return &infra.Status, nil
}
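A minimal caller sketch (illustrative only, not part of this commit) of the updated helper: with the nil guard added above, a nil error now guarantees a non-nil status, so the control-plane topology can be read directly. The restConfig parameter and the helper name are assumptions for the example.

package example

import (
	"context"
	"fmt"

	configv1 "github.com/openshift/api/config/v1"
	"github.com/openshift/library-go/pkg/config/clusterstatus"
	"k8s.io/client-go/rest"
)

// isSingleReplicaControlPlane is an illustrative helper showing the caller pattern the
// new nil guard protects: err == nil implies a usable InfrastructureStatus.
func isSingleReplicaControlPlane(ctx context.Context, restConfig *rest.Config) (bool, error) {
	status, err := clusterstatus.GetClusterInfraStatus(ctx, restConfig)
	if err != nil {
		return false, fmt.Errorf("reading Infrastructure status: %w", err)
	}
	return status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode, nil
}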
28 changes: 28 additions & 0 deletions pkg/config/leaderelection/leaderelection.go
@@ -124,3 +124,31 @@ func LeaderElectionDefaulting(config configv1.LeaderElection, defaultNamespace,
}
return ret
}

// LeaderElectionSNOConfig uses the formula derived in LeaderElectionDefaulting with increased
// retry period and lease duration for SNO clusters that have limited resources.
// This method does not respect the passed-in LeaderElection config; the returned object's timing values
// are overridden with SNO environments in mind.
// This method should only be called when running in an SNO cluster.
func LeaderElectionSNOConfig(config configv1.LeaderElection) configv1.LeaderElection {

// We want to make sure we respect a 30s clock skew as well as 4 retry attempts without making
// leader election ineffectual, while still gaining a small performance benefit by limiting calls against
// the api server.

// 1. clock skew tolerance is leaseDuration-renewDeadline == 30s
// 2. kube-apiserver downtime tolerance is 180s:
//    lastRetry = floor(renewDeadline/retryPeriod)*retryPeriod == 240s
//    downtimeTolerance = lastRetry-retryPeriod == 180s
// 3. worst non-graceful lease acquisition is leaseDuration+retryPeriod == 330s
// 4. worst graceful lease acquisition is retryPeriod == 60s

ret := *(&config).DeepCopy()
// 270-240 = 30s of clock skew tolerance
ret.LeaseDuration.Duration = 270 * time.Second
// 240/60 = 4 retry attempts before the leader is lost.
ret.RenewDeadline.Duration = 240 * time.Second
// With 60s retry config we aim to maintain 30s of clock skew as well as 4 retry attempts.
ret.RetryPeriod.Duration = 60 * time.Second
return ret
}
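The tolerances documented above can be re-derived mechanically from the three durations. Below is a small illustrative calculation (not part of the commit), assuming the 270s/240s/60s values set by LeaderElectionSNOConfig.

package main

import (
	"fmt"
	"time"
)

func main() {
	leaseDuration := 270 * time.Second
	renewDeadline := 240 * time.Second
	retryPeriod := 60 * time.Second

	clockSkewTolerance := leaseDuration - renewDeadline // 30s
	retries := int(renewDeadline / retryPeriod)         // 4 retry attempts before the leader is lost
	lastRetry := time.Duration(retries) * retryPeriod   // 240s
	downtimeTolerance := lastRetry - retryPeriod        // 180s of kube-apiserver downtime tolerated
	worstNonGraceful := leaseDuration + retryPeriod     // 330s worst-case non-graceful lease acquisition
	worstGraceful := retryPeriod                        // 60s worst-case graceful lease acquisition

	fmt.Println(clockSkewTolerance, retries, downtimeTolerance, worstNonGraceful, worstGraceful)
}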
178 changes: 178 additions & 0 deletions pkg/config/leaderelection/leaderelection_test.go
@@ -0,0 +1,178 @@
package leaderelection

import (
"reflect"
"testing"
"time"

configv1 "github.com/openshift/api/config/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestLeaderElectionSNOConfig(t *testing.T) {
testCases := []struct {
desc string
inputConfig configv1.LeaderElection
expectedConfig configv1.LeaderElection
}{
{
desc: "should not alter disable flag when true",
inputConfig: configv1.LeaderElection{
Disable: true,
},
expectedConfig: configv1.LeaderElection{
Disable: true,
LeaseDuration: metav1.Duration{Duration: 270 * time.Second},
RenewDeadline: metav1.Duration{Duration: 240 * time.Second},
RetryPeriod: metav1.Duration{Duration: 60 * time.Second},
},
},
{
desc: "should not alter disable flag when false",
inputConfig: configv1.LeaderElection{
Disable: false,
},
expectedConfig: configv1.LeaderElection{
Disable: false,
LeaseDuration: metav1.Duration{Duration: 270 * time.Second},
RenewDeadline: metav1.Duration{Duration: 240 * time.Second},
RetryPeriod: metav1.Duration{Duration: 60 * time.Second},
},
},
{
desc: "should change durations when none are provided",
inputConfig: configv1.LeaderElection{},
expectedConfig: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 270 * time.Second},
RenewDeadline: metav1.Duration{Duration: 240 * time.Second},
RetryPeriod: metav1.Duration{Duration: 60 * time.Second},
},
},
{
desc: "should change durations for sno configs",
inputConfig: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
expectedConfig: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 270 * time.Second},
RenewDeadline: metav1.Duration{Duration: 240 * time.Second},
RetryPeriod: metav1.Duration{Duration: 60 * time.Second},
},
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
resultConfig := LeaderElectionSNOConfig(tc.inputConfig)
if !reflect.DeepEqual(tc.expectedConfig, resultConfig) {
t.Errorf("expected %#v, got %#v", tc.expectedConfig, resultConfig)
}
})
}
}

func TestLeaderElectionDefaulting(t *testing.T) {
testCases := []struct {
desc string
defaultNamespace string
defaultName string
inputConfig configv1.LeaderElection
expectedConfig configv1.LeaderElection
}{
{
desc: "should not alter disable flag when true",
inputConfig: configv1.LeaderElection{
Disable: true,
},
expectedConfig: configv1.LeaderElection{
Disable: true,
LeaseDuration: metav1.Duration{Duration: 137 * time.Second},
RenewDeadline: metav1.Duration{Duration: 107 * time.Second},
RetryPeriod: metav1.Duration{Duration: 26 * time.Second},
},
},
{
desc: "should not alter disable flag when false",
inputConfig: configv1.LeaderElection{
Disable: false,
},
expectedConfig: configv1.LeaderElection{
Disable: false,
LeaseDuration: metav1.Duration{Duration: 137 * time.Second},
RenewDeadline: metav1.Duration{Duration: 107 * time.Second},
RetryPeriod: metav1.Duration{Duration: 26 * time.Second},
},
},
{
desc: "should change durations when none are provided",
inputConfig: configv1.LeaderElection{},
expectedConfig: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 137 * time.Second},
RenewDeadline: metav1.Duration{Duration: 107 * time.Second},
RetryPeriod: metav1.Duration{Duration: 26 * time.Second},
},
},
{
desc: "should use default name and namespace when none is provided",
inputConfig: configv1.LeaderElection{},
defaultName: "new-default-name",
defaultNamespace: "new-default-namespace",
expectedConfig: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 137 * time.Second},
RenewDeadline: metav1.Duration{Duration: 107 * time.Second},
RetryPeriod: metav1.Duration{Duration: 26 * time.Second},
Name: "new-default-name",
Namespace: "new-default-namespace",
},
},
{
desc: "should not alter durations when values are provided",
inputConfig: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
expectedConfig: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
},
{
desc: "should not alter when durations, name and namespace are provided",
defaultName: "new-default-name",
defaultNamespace: "new-default-namespace",
inputConfig: configv1.LeaderElection{
Name: "original-name",
Namespace: "original-namespace",
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
expectedConfig: configv1.LeaderElection{
Name: "original-name",
Namespace: "original-namespace",
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
resultConfig := LeaderElectionDefaulting(tc.inputConfig, tc.defaultNamespace, tc.defaultName)

// When testing in a cluster, LeaderElectionDefaulting falls back to the namespace read from
// /var/run/secrets/kubernetes.io/serviceaccount/namespace if none is provided.
// In that case we set expectedConfig.Namespace to resultConfig.Namespace so the comparison
// uses the dynamic namespace read from the environment.
if tc.defaultNamespace == "" && tc.inputConfig.Namespace == "" {
tc.expectedConfig.Namespace = resultConfig.Namespace
}

if !reflect.DeepEqual(tc.expectedConfig, resultConfig) {
t.Errorf("expected %#v, got %#v", tc.expectedConfig, resultConfig)
}
})
}
}
35 changes: 35 additions & 0 deletions pkg/controller/controllercmd/builder.go
@@ -13,6 +13,7 @@ import (
operatorv1alpha1 "github.com/openshift/api/operator/v1alpha1"
"github.com/openshift/library-go/pkg/authorization/hardcodedauthorizer"
"github.com/openshift/library-go/pkg/config/client"
"github.com/openshift/library-go/pkg/config/clusterstatus"
"github.com/openshift/library-go/pkg/config/configdefaults"
leaderelectionconverter "github.com/openshift/library-go/pkg/config/leaderelection"
"github.com/openshift/library-go/pkg/config/serving"
@@ -89,6 +90,10 @@ type ControllerBuilder struct {
// This stub exists for unit tests where we can check whether the graceful termination works properly.
// Default function will klog.Warning(args) and os.Exit(1).
nonZeroExitFn func(args ...interface{})

// Tracks whether the user explicitly set leader election values, used to make sure we don't stomp on the user's intent for leader election.
// We use this flag to determine at runtime whether we can alter leader election for SNO configurations.
userExplicitlySetLeaderElectionValues bool
}

// NewController returns a builder struct for constructing the command you want to run
@@ -142,6 +147,11 @@ func (b *ControllerBuilder) WithLeaderElection(leaderElection configv1.LeaderEle
return b
}

// Record whether the user supplied any leader election timing values; SNO leader election defaults are only applied when none were provided.
b.userExplicitlySetLeaderElectionValues = leaderElection.LeaseDuration.Duration != 0 ||
leaderElection.RenewDeadline.Duration != 0 ||
leaderElection.RetryPeriod.Duration != 0

defaulted := leaderelectionconverter.LeaderElectionDefaulting(leaderElection, defaultNamespace, defaultName)
b.leaderElection = &defaulted
return b
@@ -304,6 +314,17 @@ func (b *ControllerBuilder) Run(ctx context.Context, config *unstructured.Unstru
return nil
}

if !b.userExplicitlySetLeaderElectionValues {
infraStatus, err := clusterstatus.GetClusterInfraStatus(ctx, clientConfig)
if err != nil || infraStatus == nil {
eventRecorder.Warningf("ClusterInfrastructureStatus", "unable to get cluster infrastructure status, using HA cluster values for leader election: %v", err)
klog.Warningf("unable to get cluster infrastructure status, using HA cluster values for leader election: %v", err)
} else {
snoLeaderElection := infraStatusTopologyLeaderElection(infraStatus, *b.leaderElection)
b.leaderElection = &snoLeaderElection
}
}

// ensure blocking TCP connections don't block the leader election
leaderConfig := rest.CopyConfig(protoConfig)
leaderConfig.Timeout = b.leaderElection.RenewDeadline.Duration
@@ -370,3 +391,17 @@ func (b *ControllerBuilder) getClientConfig() (*rest.Config, error) {

return client.GetKubeConfigOrInClusterConfig(kubeconfig, b.clientOverrides)
}

func infraStatusTopologyLeaderElection(infraStatus *configv1.InfrastructureStatus, original configv1.LeaderElection) configv1.LeaderElection {
// if we can't determine the infra topology, return the original
if infraStatus == nil {
return original
}

// If we are running in SingleReplicaTopologyMode and leader election is not disabled, configure leader election for SNO topology
if infraStatus.ControlPlaneTopology == configv1.SingleReplicaTopologyMode && !original.Disable {
klog.Info("detected SingleReplicaTopologyMode, the original leader election has been altered for the default SingleReplicaTopology")
return leaderelectionconverter.LeaderElectionSNOConfig(original)
}
return original
}
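Putting the pieces together, the sketch below (illustrative only; resolveLeaderElection is not part of library-go) traces the decision the builder now makes: WithLeaderElection records whether the user set any timing values and applies the HA defaults, and Run swaps in the SNO values only when no explicit values were given, leader election is not disabled, and the control plane reports SingleReplicaTopologyMode.

package example

import (
	"context"

	configv1 "github.com/openshift/api/config/v1"
	"github.com/openshift/library-go/pkg/config/clusterstatus"
	leaderelectionconverter "github.com/openshift/library-go/pkg/config/leaderelection"
	"k8s.io/client-go/rest"
)

// resolveLeaderElection mirrors the flow added to ControllerBuilder in this commit,
// collapsed into a single hypothetical helper for readability.
func resolveLeaderElection(ctx context.Context, cfg *rest.Config, userCfg configv1.LeaderElection, defaultNamespace, defaultName string) configv1.LeaderElection {
	// any non-zero timing duration counts as explicit user intent
	userExplicitlySet := userCfg.LeaseDuration.Duration != 0 ||
		userCfg.RenewDeadline.Duration != 0 ||
		userCfg.RetryPeriod.Duration != 0

	resolved := leaderelectionconverter.LeaderElectionDefaulting(userCfg, defaultNamespace, defaultName)
	if userExplicitlySet || resolved.Disable {
		return resolved
	}
	// user gave no timings: relax them on single-replica control planes
	if status, err := clusterstatus.GetClusterInfraStatus(ctx, cfg); err == nil &&
		status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode {
		return leaderelectionconverter.LeaderElectionSNOConfig(resolved)
	}
	return resolved
}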
67 changes: 67 additions & 0 deletions pkg/controller/controllercmd/builder_test.go
@@ -3,11 +3,14 @@ package controllercmd
import (
"context"
"fmt"
"reflect"
"strings"
"testing"
"time"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/library-go/pkg/operator/events/eventstesting"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestControllerBuilder_getOnStartedLeadingFunc(t *testing.T) {
@@ -184,3 +187,67 @@ func TestControllerBuilder_OnLeadingFunc_NonZeroExit(t *testing.T) {
t.Fatal("unexpected timeout while terminating")
}
}

func TestInfraStatusTopologyLeaderElection(t *testing.T) {
testCases := []struct {
desc string
infra *configv1.InfrastructureStatus
original configv1.LeaderElection
expected configv1.LeaderElection
}{
{
desc: "should not set SNO config when infra is nil",
infra: nil,
original: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
expected: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
},
{
desc: "should not set SNO config when infra is HighlyAvailableTopologyMode",
infra: &configv1.InfrastructureStatus{
ControlPlaneTopology: configv1.HighlyAvailableTopologyMode,
},
original: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
expected: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
},
{
desc: "should set SNO leader election config when SingleReplicaToplogy Controlplane",
infra: &configv1.InfrastructureStatus{
ControlPlaneTopology: configv1.SingleReplicaTopologyMode,
},
original: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
RenewDeadline: metav1.Duration{Duration: 40 * time.Second},
RetryPeriod: metav1.Duration{Duration: 20 * time.Second},
},
expected: configv1.LeaderElection{
LeaseDuration: metav1.Duration{Duration: 270 * time.Second},
RenewDeadline: metav1.Duration{Duration: 240 * time.Second},
RetryPeriod: metav1.Duration{Duration: 60 * time.Second},
},
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
leConfig := infraStatusTopologyLeaderElection(tc.infra, tc.original)
if !reflect.DeepEqual(tc.expected, leConfig) {
t.Errorf("expected %#v, got %#v", tc.expected, leConfig)
}
})
}
}
