MCO: clear out failing status on success and add tests #442

Merged Feb 17, 2019 (4 commits)
Changes from all commits
17 changes: 17 additions & 0 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Gopkg.toml
@@ -39,6 +39,10 @@ required = [
   go-tests = false
   unused-packages = false
 
+[[constraint]]
+  name = "github.com/stretchr/testify"
+  version = "v1.3.0"
+
 [[constraint]]
   name = "github.com/apparentlymart/go-cidr"
   version = "1.0.0"
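The new testify constraint backs the assertions in the unit tests this PR adds (for example the assert.Nil and assert.False calls visible in the test diff quoted in the review thread below). A minimal usage sketch of the dependency; the test name here is illustrative, not part of the PR:

package operator_test

import (
    "testing"

    "github.com/stretchr/testify/assert"
)

func TestExampleAssertion(t *testing.T) {
    var err error
    // assert.Nil reports a test failure (without aborting) when err is non-nil.
    assert.Nil(t, err, "expected err to be nil")
    assert.False(t, false, "expected the value to be false")
}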
45 changes: 41 additions & 4 deletions cmd/common/client_builder.go
@@ -4,8 +4,10 @@ import (
 	"os"
 
 	"github.com/golang/glog"
+	configv1 "github.com/openshift/api/config/v1"
 	configclientset "github.com/openshift/client-go/config/clientset/versioned"
 	apiext "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	"k8s.io/client-go/tools/clientcmd"
@@ -19,23 +21,58 @@ type ClientBuilder struct {
 	config *rest.Config
 }
 
-// ClientOrDie returns the kubernetes client interface for machine config.
+// MachineConfigClientOrDie returns the kubernetes client interface for machine config.
 func (cb *ClientBuilder) MachineConfigClientOrDie(name string) mcfgclientset.Interface {
 	return mcfgclientset.NewForConfigOrDie(rest.AddUserAgent(cb.config, name))
 }
 
-// ClientOrDie returns the kubernetes client interface for general kubernetes objects.
+// KubeClientOrDie returns the kubernetes client interface for general kubernetes objects.
 func (cb *ClientBuilder) KubeClientOrDie(name string) kubernetes.Interface {
 	return kubernetes.NewForConfigOrDie(rest.AddUserAgent(cb.config, name))
 }
 
-// ClientOrDie returns the kubernetes client interface for security related kubernetes objects
+// clusterOperatorsClient is a wrapper around the ClusterOperators client
+// implementing the ClusterOperatorsClientInterface
+type clusterOperatorsClient struct {
+	innerConfigClient configclientset.Interface
+}
+
+// Create creates the cluster operator and returns it
+func (c *clusterOperatorsClient) Create(co *configv1.ClusterOperator) (*configv1.ClusterOperator, error) {
+	return c.innerConfigClient.ConfigV1().ClusterOperators().Create(co)
+}
+
+// UpdateStatus updates the status of the cluster operator and returns it
+func (c *clusterOperatorsClient) UpdateStatus(co *configv1.ClusterOperator) (*configv1.ClusterOperator, error) {
+	return c.innerConfigClient.ConfigV1().ClusterOperators().UpdateStatus(co)
+}
+
+// Get returns the cluster operator by name
+func (c *clusterOperatorsClient) Get(name string, options metav1.GetOptions) (*configv1.ClusterOperator, error) {
+	return c.innerConfigClient.ConfigV1().ClusterOperators().Get(name, options)
+}
+
+// ClusterOperatorsClientInterface is a controlled ClusterOperators client which is used
+// by the operator and allows us to mock it in tests.
+type ClusterOperatorsClientInterface interface {
+	Create(*configv1.ClusterOperator) (*configv1.ClusterOperator, error)
+	UpdateStatus(*configv1.ClusterOperator) (*configv1.ClusterOperator, error)
+	Get(name string, options metav1.GetOptions) (*configv1.ClusterOperator, error)
+}
+
+// ClusterOperatorsClientOrDie returns the controlled ClusterOperators client used by the operator
+func (cb *ClientBuilder) ClusterOperatorsClientOrDie(name string) ClusterOperatorsClientInterface {
+	cc := configclientset.NewForConfigOrDie(rest.AddUserAgent(cb.config, name))
+	return &clusterOperatorsClient{innerConfigClient: cc}
+}
+
+// ConfigClientOrDie returns the kubernetes client interface for security related kubernetes objects
 // such as pod security policy, security context.
 func (cb *ClientBuilder) ConfigClientOrDie(name string) configclientset.Interface {
 	return configclientset.NewForConfigOrDie(rest.AddUserAgent(cb.config, name))
 }
 
-// ClientOrDie returns the kubernetes client interface for extended kubernetes objects.
+// APIExtClientOrDie returns the kubernetes client interface for extended kubernetes objects.
 func (cb *ClientBuilder) APIExtClientOrDie(name string) apiext.Interface {
 	return apiext.NewForConfigOrDie(rest.AddUserAgent(cb.config, name))
 }

[runcom (Member Author), on ClusterOperatorsClientInterface]: This is important for us to be able to control what we use in the MCO itself and allows us to create meaningful mocks for proper unit tests.
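To make the mocking point concrete: any struct with these three methods satisfies the interface. The sketch below is hypothetical (the PR's actual test doubles live in the new unit tests, not here); fakeClusterOperatorsClient, its single-object storage, and the plain error from Get are all illustrative. It reuses the configv1 and metav1 imports from the file above, plus fmt:

// fakeClusterOperatorsClient is a hypothetical in-memory stand-in for
// ClusterOperatorsClientInterface, storing at most one ClusterOperator.
type fakeClusterOperatorsClient struct {
    co *configv1.ClusterOperator
}

func (f *fakeClusterOperatorsClient) Create(co *configv1.ClusterOperator) (*configv1.ClusterOperator, error) {
    f.co = co.DeepCopy()
    return f.co, nil
}

func (f *fakeClusterOperatorsClient) UpdateStatus(co *configv1.ClusterOperator) (*configv1.ClusterOperator, error) {
    f.co = co.DeepCopy()
    return f.co, nil
}

func (f *fakeClusterOperatorsClient) Get(name string, options metav1.GetOptions) (*configv1.ClusterOperator, error) {
    if f.co == nil {
        // a real client would return an apierrors NotFound here
        return nil, fmt.Errorf("clusteroperator %q not found", name)
    }
    return f.co.DeepCopy(), nil
}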
2 changes: 1 addition & 1 deletion cmd/machine-config-operator/start.go
@@ -99,7 +99,7 @@ func startControllers(ctx *common.ControllerContext) error {
 		ctx.ClientBuilder.MachineConfigClientOrDie(componentName),
 		ctx.ClientBuilder.KubeClientOrDie(componentName),
 		ctx.ClientBuilder.APIExtClientOrDie(componentName),
-		ctx.ClientBuilder.ConfigClientOrDie(componentName),
+		ctx.ClientBuilder.ClusterOperatorsClientOrDie(componentName),
 	).Run(2, ctx.Stop)
 
 	return nil
18 changes: 14 additions & 4 deletions pkg/operator/operator.go
@@ -29,11 +29,12 @@ import (
 	"k8s.io/client-go/util/workqueue"
 
 	configv1 "github.com/openshift/api/config/v1"
-	configclientset "github.com/openshift/client-go/config/clientset/versioned"
 	configinformersv1 "github.com/openshift/client-go/config/informers/externalversions/config/v1"
 	configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
 	installertypes "github.com/openshift/installer/pkg/types"
 
+	// TODO(runcom): move to pkg
+	"github.com/openshift/machine-config-operator/cmd/common"
 	mcfgv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
 	templatectrl "github.com/openshift/machine-config-operator/pkg/controller/template"
 	mcfgclientset "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
@@ -68,7 +69,7 @@ type Operator struct {
 	client        mcfgclientset.Interface
 	kubeClient    kubernetes.Interface
 	apiExtClient  apiextclientset.Interface
-	configClient  configclientset.Interface
+	configClient  common.ClusterOperatorsClientInterface
 	eventRecorder record.EventRecorder
 
 	syncHandler func(ic string) error
@@ -115,7 +116,7 @@ func New(
 	client mcfgclientset.Interface,
 	kubeClient kubernetes.Interface,
 	apiExtClient apiextclientset.Interface,
-	configClient configclientset.Interface,
+	configClient common.ClusterOperatorsClientInterface,
 ) *Operator {
 	eventBroadcaster := record.NewBroadcaster()
 	eventBroadcaster.StartLogging(glog.Infof)
@@ -329,7 +330,16 @@ func (optr *Operator) sync(key string) error {
 
 	// create renderConfig
 	rc := getRenderConfig(namespace, spec, imgs, infra.Status.APIServerURL)
-	return optr.syncAll(rc)
+	// syncFuncs is the list of sync functions that are executed in order.
+	// any error marks sync as failure but continues to next syncFunc
+	var syncFuncs = []syncFunc{
+		{"pools", optr.syncMachineConfigPools},
+		{"mcc", optr.syncMachineConfigController},
+		{"mcs", optr.syncMachineConfigServer},
+		{"mcd", optr.syncMachineConfigDaemon},
+		{"required-pools", optr.syncRequiredMachineConfigPools},
+	}
+	return optr.syncAll(rc, syncFuncs)
 }
 
 func (optr *Operator) getOsImageURL(namespace string) (string, error) {
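For context on how that list is consumed: syncAll (in pkg/operator/sync.go, which this diff does not show) walks the syncFuncs and then writes status. The sketch below is a hedged approximation of that flow, assuming the syncFunc struct fields (name string, fn func(renderConfig) error) visible in the test code quoted further down; the real implementation may differ in its details. It shows why this PR matters: syncFailingStatus is now invoked on every pass, including with a nil error after a clean run:

// Approximate shape of syncAll after this PR (illustrative, not the
// actual pkg/operator/sync.go source).
func (optr *Operator) syncAll(rc renderConfig, syncFuncs []syncFunc) error {
    if err := optr.syncProgressingStatus(); err != nil {
        return err
    }

    var syncErr error
    for _, sf := range syncFuncs {
        if err := sf.fn(rc); err != nil {
            // per the comment in sync(): record the failure, keep going
            syncErr = fmt.Errorf("%s failed: %v", sf.name, err)
        }
    }

    // The behavioral fix: this runs even when syncErr is nil, so a stale
    // Failing=True condition is cleared after a successful sync.
    if err := optr.syncFailingStatus(syncErr); err != nil {
        return err
    }
    if syncErr != nil {
        return syncErr
    }
    return optr.syncAvailableStatus()
}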
55 changes: 29 additions & 26 deletions pkg/operator/status.go
@@ -28,13 +28,12 @@ func (optr *Operator) syncAvailableStatus() error {
 	}
 
 	optrVersion, _ := optr.vStore.Get("operator")
-	progressing := cov1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorProgressing)
 	failing := cov1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorFailing)
 	message := fmt.Sprintf("Cluster has deployed %s", optrVersion)
 
 	available := configv1.ConditionTrue
 
-	if failing && !progressing {
[Reviewer (Member)]: Hm, so failing now always means !available? OK, that looks like what the CVO does as well, I think.

[runcom (Member Author), Feb 17, 2019]: My understanding for the MCO is that if we fail the sync during a progressing, we could have e.g. applied a new MCO or something else that can likely misbehave if at some point we failed, and thus there is no reason to report available. @abhinavdahiya what do you think?

[runcom (Member Author), Feb 17, 2019]: So, looking at https://github.com/openshift/cluster-version-operator/blob/master/docs/dev/clusteroperator.md#conditions, there's an example:

    If an error blocks reaching 4.0.1, the conditions might be:

    - Failing is true with a detailed message: Unable to apply 4.0.1: could not update 0000_70_network_deployment.yaml because the resource type NetworkConfig has not been installed on the server.
    - Available is true with message: Cluster has deployed 4.0.0
    - Progressing is true with message: Unable to apply 4.0.1: a required object is missing

I'm not sure how to read that. If any of the syncFuncs here fails during a progressing, the status of the MCO isn't really available, because we may have e.g. started rolling a new mcc but the mcd broke, and it makes little sense to report Available.

[runcom (Member Author)]: Actually, maybe that !progressing is still right, because the bug was really that we weren't clearing failed on a successful sync.

[runcom (Member Author)]: This patch accounts for that case though, and the changed test accounts for my bad assumption (all other tests are fine anyway with this):

diff --git a/pkg/operator/status.go b/pkg/operator/status.go
index b71370e..3ce2262 100644
--- a/pkg/operator/status.go
+++ b/pkg/operator/status.go
@@ -29,11 +29,12 @@ func (optr *Operator) syncAvailableStatus() error {
 
 	optrVersion, _ := optr.vStore.Get("operator")
 	failing := cov1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorFailing)
+	progressing := cov1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorProgressing)
 	message := fmt.Sprintf("Cluster has deployed %s", optrVersion)
 
 	available := configv1.ConditionTrue
 
-	if failing {
+	if (failing && !progressing) || (failing && optr.inClusterBringup) {
 		available = configv1.ConditionFalse
 		message = fmt.Sprintf("Cluster not available for %s", optrVersion)
 	}
diff --git a/pkg/operator/status_test.go b/pkg/operator/status_test.go
index 1437769..9b24c97 100644
--- a/pkg/operator/status_test.go
+++ b/pkg/operator/status_test.go
@@ -350,8 +350,7 @@ func TestOperatorSyncStatus(t *testing.T) {
 				},
 			},
 		},
-		// 3. test that if progressing fails, we report available=false because state of the operator
-		//    might have changed in the various sync calls
+		// 3. test that if progressing fails, we report available=true for the current version
 		{
 			syncs: []syncCase{
 				{
@@ -390,7 +389,7 @@ func TestOperatorSyncStatus(t *testing.T) {
 						},
 						{
 							Type:   configv1.OperatorAvailable,
-							Status: configv1.ConditionFalse,
+							Status: configv1.ConditionTrue,
 						},
 						{
 							Type:   configv1.OperatorFailing,
@@ -405,6 +404,29 @@ func TestOperatorSyncStatus(t *testing.T) {
 						},
 					},
 				},
+				{
+					// we mock the fact that we are at operator=test-version-2 after the previous sync
+					cond: []configv1.ClusterOperatorStatusCondition{
+						{
+							Type:   configv1.OperatorProgressing,
+							Status: configv1.ConditionFalse,
+						},
+						{
+							Type:   configv1.OperatorAvailable,
+							Status: configv1.ConditionTrue,
+						},
+						{
+							Type:   configv1.OperatorFailing,
+							Status: configv1.ConditionFalse,
+						},
+					},
+					syncFuncs: []syncFunc{
+						{
+							name: "fn1",
+							fn:   func(config renderConfig) error { return nil },
+						},
+					},
+				},
 			},
 		},
 		// 4. test that if progressing fails during bringup, we still report failing and not available
@@ -601,4 +623,4 @@ func TestInClusterBringUpStayOnErr(t *testing.T) {
 	assert.Nil(t, err, "expected syncAll to pass")
 
 	assert.False(t, optr.inClusterBringup)
-}
\ No newline at end of file
+}

[runcom (Member Author)]: The patch above effectively enables the MCO to report available=true, progressing=true, failing=true if we get a failure during a progressing but the MCO is still available (assumption from the CVO doc).
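Restated as a predicate, the availability rule proposed in that inline patch is (a paraphrase of the diff above, not code from the repository):

// available reports whether the proposed rule would set Available=true:
// a failure outside of an upgrade, or any failure during the initial
// cluster bringup, makes the operator unavailable, while a failure during
// a later upgrade keeps the previously deployed version available.
func available(failing, progressing, inClusterBringup bool) bool {
    return !((failing && !progressing) || (failing && inClusterBringup))
}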

[runcom (Member Author)]: Opened #450 to further discuss this.

+	if failing {
 		available = configv1.ConditionFalse
 		message = fmt.Sprintf("Cluster not available for %s", optrVersion)
 	}
@@ -47,7 +46,7 @@ func (optr *Operator) syncAvailableStatus() error {
 
 	co.Status.Versions = optr.vStore.GetAll()
 	optr.setMachineConfigPoolStatuses(&co.Status)
-	_, err = optr.configClient.ConfigV1().ClusterOperators().UpdateStatus(co)
+	_, err = optr.configClient.UpdateStatus(co)
 	return err
 }
@@ -67,6 +66,7 @@ func (optr *Operator) syncProgressingStatus() error {
 
 	if optr.vStore.Equal(co.Status.Versions) {
 		if optr.inClusterBringup {
+			message = fmt.Sprintf("Cluster is bootstrapping %s", optrVersion)
 			progressing = configv1.ConditionTrue
 		}
 	} else {
@@ -80,15 +80,12 @@ func (optr *Operator) syncProgressingStatus() error {
 	})
 
 	optr.setMachineConfigPoolStatuses(&co.Status)
-	_, err = optr.configClient.ConfigV1().ClusterOperators().UpdateStatus(co)
+	_, err = optr.configClient.UpdateStatus(co)
 	return err
 }
 
 // syncFailingStatus applies the new condition to the mco's ClusterOperator object.
-func (optr *Operator) syncFailingStatus(ierr error) error {
-	if ierr == nil {
-		return nil
-	}
+func (optr *Operator) syncFailingStatus(ierr error) (err error) {
 	co, err := optr.fetchClusterOperator()
 	if err != nil {
 		return err
@@ -98,34 +95,40 @@
 	}
 
 	optrVersion, _ := optr.vStore.Get("operator")
-	var message string
-	if optr.vStore.Equal(co.Status.Versions) {
-		// syncing the state to existing version.
-		message = fmt.Sprintf("Failed to resync %s because: %v", optrVersion, ierr.Error())
+	failing := configv1.ConditionTrue
+	var message, reason string
+	if ierr == nil {
+		failing = configv1.ConditionFalse
 	} else {
-		message = fmt.Sprintf("Unable to apply %s: %v", optrVersion, ierr.Error())
+		if optr.vStore.Equal(co.Status.Versions) {
+			// syncing the state to existing version.
+			message = fmt.Sprintf("Failed to resync %s because: %v", optrVersion, ierr.Error())
+		} else {
+			message = fmt.Sprintf("Unable to apply %s: %v", optrVersion, ierr.Error())
+		}
+		reason = ierr.Error()
+
+		// set progressing
+		if cov1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorProgressing) {
+			cov1helpers.SetStatusCondition(&co.Status.Conditions, configv1.ClusterOperatorStatusCondition{Type: configv1.OperatorProgressing, Status: configv1.ConditionTrue, Message: fmt.Sprintf("Unable to apply %s", version.Version.String())})
+		} else {
+			cov1helpers.SetStatusCondition(&co.Status.Conditions, configv1.ClusterOperatorStatusCondition{Type: configv1.OperatorProgressing, Status: configv1.ConditionFalse, Message: fmt.Sprintf("Error while reconciling %s", version.Version.String())})
+		}
 	}
 	// set failing condition
 	cov1helpers.SetStatusCondition(&co.Status.Conditions, configv1.ClusterOperatorStatusCondition{
-		Type: configv1.OperatorFailing, Status: configv1.ConditionTrue,
+		Type: configv1.OperatorFailing, Status: failing,
 		Message: message,
-		Reason: ierr.Error(),
+		Reason: reason,
 	})
 
-	// set progressing
-	if cov1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorProgressing) {
-		cov1helpers.SetStatusCondition(&co.Status.Conditions, configv1.ClusterOperatorStatusCondition{Type: configv1.OperatorProgressing, Status: configv1.ConditionTrue, Message: fmt.Sprintf("Unable to apply %s", version.Version.String())})
-	} else {
-		cov1helpers.SetStatusCondition(&co.Status.Conditions, configv1.ClusterOperatorStatusCondition{Type: configv1.OperatorProgressing, Status: configv1.ConditionFalse, Message: fmt.Sprintf("Error while reconciling %s", version.Version.String())})
-	}
-
 	optr.setMachineConfigPoolStatuses(&co.Status)
-	_, err = optr.configClient.ConfigV1().ClusterOperators().UpdateStatus(co)
+	_, err = optr.configClient.UpdateStatus(co)
 	return err
 }
 
 func (optr *Operator) fetchClusterOperator() (*configv1.ClusterOperator, error) {
-	co, err := optr.configClient.ConfigV1().ClusterOperators().Get(optr.name, metav1.GetOptions{})
+	co, err := optr.configClient.Get(optr.name, metav1.GetOptions{})
 	if meta.IsNoMatchError(err) {
 		return nil, nil
 	}
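Net effect of the syncFailingStatus rewrite above: the function now always writes the Failing condition, setting it to ConditionTrue with a message and reason when a sync error is passed in, and to ConditionFalse when the sync succeeded, instead of returning early on a nil error and leaving a stale Failing=True behind. That early return is the bug the PR title refers to.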
@@ -139,7 +142,7 @@ func (optr *Operator) fetchClusterOperator() (*configv1.ClusterOperator, error) {
 }
 
 func (optr *Operator) initializeClusterOperator() (*configv1.ClusterOperator, error) {
-	co, err := optr.configClient.ConfigV1().ClusterOperators().Create(&configv1.ClusterOperator{
+	co, err := optr.configClient.Create(&configv1.ClusterOperator{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: optr.name,
 		},
@@ -154,7 +157,7 @@ func (optr *Operator) initializeClusterOperator() (*configv1.ClusterOperator, error) {
 	co.Status.RelatedObjects = []configv1.ObjectReference{
 		{Resource: "namespaces", Name: "openshift-machine-config-operator"},
 	}
-	return optr.configClient.ConfigV1().ClusterOperators().UpdateStatus(co)
+	return optr.configClient.UpdateStatus(co)
 }
 
 func (optr *Operator) setMachineConfigPoolStatuses(status *configv1.ClusterOperatorStatus) {