Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
prevent Available=true until all kube-apiservers have restarted
  • Loading branch information
deads2k committed Aug 6, 2020
1 parent 88367f3 commit a08be23
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 2 deletions.
68 changes: 68 additions & 0 deletions pkg/controllers/readiness/unsupported_override.go
@@ -0,0 +1,68 @@
package readiness

import (
"bytes"
"encoding/json"
"strconv"

utilruntime "k8s.io/apimachinery/pkg/util/runtime"

operatorv1 "github.com/openshift/api/operator/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
kyaml "k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/klog"
)

// isUnsupportedUnsafeAuthentication returns true if
// useUnsupportedUnsafeNonHANonProductionUnstableOAuthServer key is set
// to any parsable true value
func isUnsupportedUnsafeAuthentication(spec *operatorv1.OperatorSpec) (bool, error) {
unsupportedConfig := map[string]interface{}{}
if spec.UnsupportedConfigOverrides.Raw == nil {
return false, nil
}

configJson, err := kyaml.ToJSON(spec.UnsupportedConfigOverrides.Raw)
if err != nil {
klog.Warning(err)
// maybe it's just json
configJson = spec.UnsupportedConfigOverrides.Raw
}

if err := json.NewDecoder(bytes.NewBuffer(configJson)).Decode(&unsupportedConfig); err != nil {
klog.V(4).Infof("decode of unsupported config failed with error: %v", err)
return false, err
}

// 1. this violates operational best practices for authentication - unstable
// 2. this allows configuration to vary between kube-apiservers - unsafe and non-HA, non-production
// 3. the combination of all these things makes the situation unsupportable.
value, found, err := unstructured.NestedFieldNoCopy(unsupportedConfig, "useUnsupportedUnsafeNonHANonProductionUnstableOAuthServer")
if err != nil {
return false, err
}
if !found {
return false, nil
}
switch value.(type) {
case bool:
return value.(bool), nil
case string:
return strconv.ParseBool(value.(string))
default:
return false, nil
}
}

func getExpectedMinimumNumberOfMasters(spec *operatorv1.OperatorSpec) int {
allowAnyNumber, err := isUnsupportedUnsafeAuthentication(spec)
switch {
case err != nil:
utilruntime.HandleError(err)
return 3
case allowAnyNumber:
return 1
default:
return 3
}
}
18 changes: 16 additions & 2 deletions pkg/controllers/readiness/wellknown_ready_controller.go
Expand Up @@ -96,7 +96,11 @@ func (c *wellKnownReadyController) sync(ctx context.Context, controllerContext f

if authConfig != nil && route != nil {
// TODO: refactor this to return conditions
ready, conditionMessage, err := c.isWellknownEndpointsReady(authConfig, route)
spec, _, _, err := c.operatorClient.GetOperatorState()
if err != nil {
return err
}
ready, conditionMessage, err := c.isWellknownEndpointsReady(spec, authConfig, route)
if !ready {
if len(conditionMessage) == 0 && err != nil {
conditionMessage = err.Error()
Expand All @@ -121,7 +125,7 @@ func (c *wellKnownReadyController) sync(ctx context.Context, controllerContext f
return common.UpdateControllerConditions(c.operatorClient, knownConditionNames, foundConditions)
}

func (c *wellKnownReadyController) isWellknownEndpointsReady(authConfig *configv1.Authentication, route *routev1.Route) (bool, string, error) {
func (c *wellKnownReadyController) isWellknownEndpointsReady(spec *operatorv1.OperatorSpec, authConfig *configv1.Authentication, route *routev1.Route) (bool, string, error) {
// don't perform this check when OAuthMetadata reference is set up
// leave those cases to KAS-o which handles these cases
if userMetadataConfig := authConfig.Spec.OAuthMetadata.Name; authConfig.Spec.Type != configv1.AuthenticationTypeIntegratedOAuth || len(userMetadataConfig) != 0 {
Expand Down Expand Up @@ -151,6 +155,16 @@ func (c *wellKnownReadyController) isWellknownEndpointsReady(authConfig *configv
}
}

// if we don't have the min number of masters, this is actually ok, however Clayton has draw a hardline on starting tests as soon as all operators are Available=true
// while ignoring progressing=false. This means that even though no external observer will see a invalid .well-known information,
// the tests end up failing when their long lived connections are terminated. Killing long lived connections is normal and
// acceptable for the kube-apiserver to do during a rollout. However, because we are not allowed to merge code that ensures
// a stable kube-apiserver and because rewriting client tests like e2e-cmd is impractical, we are left trying to enforce
// this by delaying our availability because it's a backdoor into slowing down the test suite start time to gain stability.
if expectedMinNumber := getExpectedMinimumNumberOfMasters(spec); len(ips) < expectedMinNumber {
return false, "", fmt.Errorf("need at least %d kube-apiservers, got %d", expectedMinNumber, len(ips))
}

return true, "", nil
}

Expand Down

0 comments on commit a08be23

Please sign in to comment.