Skip to content

Commit

Permalink
UPSTREAM: <carry>: skip posting failures to aggregated APIs to avoid …
Browse files Browse the repository at this point in the history
…getting false positives until the server becomes ready

the availability checks depend on fully initialized SDN
OpenShift carries a few reachability checks that affect /readyz protocol
we skip posting failures to avoid getting false positives until the server becomes ready

UPSTREAM: <carry>: skip posting failures to aggregated APIs to avoid getting false positives until the server becomes ready

marks availability of the server before checking the aggregate APIs
as it can change as we are running the checks.
in that case, skip posting failures to avoid false positives.

note on the next rebase please squash with the previous commit

UPSTREAM: <carry>: expose HasBeenReady lifecycle signal

OpenShift-Rebase-Source: 8558e88
  • Loading branch information
p0lyn0mial authored and sairameshv committed Dec 4, 2023
1 parent b94a62c commit 50db496
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 1 deletion.
5 changes: 5 additions & 0 deletions staging/src/k8s.io/apiserver/pkg/server/config.go
Expand Up @@ -680,6 +680,11 @@ func (c *Config) DrainedNotify() <-chan struct{} {
return c.lifecycleSignals.InFlightRequestsDrained.Signaled()
}

// HasBeenReadySignal exposes a server's lifecycle signal which is signaled when the readyz endpoint succeeds for the first time.
func (c *Config) HasBeenReadySignal() <-chan struct{} {
return c.lifecycleSignals.HasBeenReady.Signaled()
}

// Complete fills in any fields not set that are required to have valid data and can be derived
// from other fields. If you're going to `ApplyOptions`, do that first. It's mutating the receiver.
func (c *Config) Complete(informers informers.SharedInformerFactory) CompletedConfig {
Expand Down
Expand Up @@ -326,6 +326,7 @@ func (c completedConfig) NewWithDelegate(delegationTarget genericapiserver.Deleg
proxyTransportDial,
(func() ([]byte, []byte))(s.proxyCurrentCertKeyContent),
s.serviceResolver,
c.GenericConfig.HasBeenReadySignal(),
)
if err != nil {
return nil, err
Expand Down
Expand Up @@ -89,6 +89,9 @@ type AvailableConditionController struct {

// metrics registered into legacy registry
metrics *availabilityMetrics

// hasBeenReady is signaled when the readyz endpoint succeeds for the first time.
hasBeenReady <-chan struct{}
}

// NewAvailableConditionController returns a new AvailableConditionController.
Expand All @@ -100,6 +103,7 @@ func NewAvailableConditionController(
proxyTransportDial *transport.DialHolder,
proxyCurrentCertKeyContent certKeyFunc,
serviceResolver ServiceResolver,
hasBeenReady <-chan struct{},
) (*AvailableConditionController, error) {
c := &AvailableConditionController{
apiServiceClient: apiServiceClient,
Expand All @@ -116,6 +120,7 @@ func NewAvailableConditionController(
proxyTransportDial: proxyTransportDial,
proxyCurrentCertKeyContent: proxyCurrentCertKeyContent,
metrics: newAvailabilityMetrics(),
hasBeenReady: hasBeenReady,
}

// resync on this one because it is low cardinality and rechecking the actual discovery
Expand Down Expand Up @@ -169,6 +174,18 @@ func (c *AvailableConditionController) sync(key string) error {
return err
}

// the availability checks depend on fully initialized SDN
// OpenShift carries a few reachability checks that affect /readyz protocol
// record availability of the server so that we can
// skip posting failures to avoid getting false positives until the server becomes ready
hasBeenReady := false
select {
case <-c.hasBeenReady:
hasBeenReady = true
default:
// continue, we will skip posting only potential failures
}

// if a particular transport was specified, use that otherwise build one
// construct an http client that will ignore TLS verification (if someone owns the network and messes with your status
// that's not so bad) and sets a very short timeout. This is a best effort GET that provides no additional information
Expand Down Expand Up @@ -359,6 +376,11 @@ func (c *AvailableConditionController) sync(key string) error {
}

if lastError != nil {
if !hasBeenReady {
// returning an error will requeue the item in an exponential fashion
return fmt.Errorf("the server hasn't been ready yet, skipping updating availability of the aggreaged API until the server becomes ready to avoid false positives, lastError = %v", lastError)
}

availableCondition.Status = apiregistrationv1.ConditionFalse
availableCondition.Reason = "FailedDiscoveryCheck"
availableCondition.Message = lastError.Error()
Expand Down
Expand Up @@ -119,6 +119,8 @@ func setupAPIServices(apiServices []*apiregistration.APIService) (*AvailableCond
for _, o := range apiServices {
apiServiceIndexer.Add(o)
}
alwaysReadyChan := make(chan struct{})
close(alwaysReadyChan)

c := AvailableConditionController{
apiServiceClient: fakeClient.ApiregistrationV1(),
Expand All @@ -132,7 +134,8 @@ func setupAPIServices(apiServices []*apiregistration.APIService) (*AvailableCond
// the maximum disruption time to a minimum, but it does prevent hot loops.
workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 30*time.Second),
"AvailableConditionController"),
metrics: newAvailabilityMetrics(),
metrics: newAvailabilityMetrics(),
hasBeenReady: alwaysReadyChan,
}
for _, svc := range apiServices {
c.addAPIService(svc)
Expand Down Expand Up @@ -386,6 +389,8 @@ func TestSync(t *testing.T) {
w.WriteHeader(tc.backendStatus)
}))
defer testServer.Close()
alwaysReadyChan := make(chan struct{})
close(alwaysReadyChan)

c := AvailableConditionController{
apiServiceClient: fakeClient.ApiregistrationV1(),
Expand All @@ -395,6 +400,7 @@ func TestSync(t *testing.T) {
serviceResolver: &fakeServiceResolver{url: testServer.URL},
proxyCurrentCertKeyContent: func() ([]byte, []byte) { return emptyCert(), emptyCert() },
metrics: newAvailabilityMetrics(),
hasBeenReady: alwaysReadyChan,
}
c.sync(tc.apiServiceName)

Expand Down

0 comments on commit 50db496

Please sign in to comment.