From 6f4e5c7e7e4afbc5aeb7842c6a58206e51204bfa Mon Sep 17 00:00:00 2001 From: Ryota Arai Date: Sun, 27 Nov 2022 22:04:50 +0900 Subject: [PATCH 1/4] NodeOperation waits the Node to be remediated after Job completed. --- api/v1alpha1/nodeoperation_types.go | 9 ++++++ api/v1alpha1/noderemediation_types.go | 28 +++++++++++++++++ controllers/nodeoperation_controller.go | 19 ++++++++++-- controllers/noderemediation_controller.go | 38 ++++++++++------------- 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/api/v1alpha1/nodeoperation_types.go b/api/v1alpha1/nodeoperation_types.go index 6f33a89..ca8f7b3 100644 --- a/api/v1alpha1/nodeoperation_types.go +++ b/api/v1alpha1/nodeoperation_types.go @@ -106,6 +106,15 @@ type NodeOperationList struct { Items []NodeOperation `json:"items"` } +func (o *NodeOperation) NodeRemediationName() string { + for _, owner := range o.OwnerReferences { + if owner.APIVersion == GroupVersion.String() && owner.Kind == "NodeRemediation" { + return owner.Name + } + } + return "" +} + func init() { SchemeBuilder.Register(&NodeOperation{}, &NodeOperationList{}) } diff --git a/api/v1alpha1/noderemediation_types.go b/api/v1alpha1/noderemediation_types.go index 4fded16..685f629 100644 --- a/api/v1alpha1/noderemediation_types.go +++ b/api/v1alpha1/noderemediation_types.go @@ -47,6 +47,14 @@ type NodeConditionMatcher struct { Status corev1.ConditionStatus `json:"status"` } +type NodeStatus string + +const ( + NodeStatusUnknown NodeStatus = "" + NodeStatusOK NodeStatus = "OK" + NodeStatusBad NodeStatus = "Bad" +) + // NodeRemediationStatus defines the observed state of NodeRemediation type NodeRemediationStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster @@ -54,6 +62,8 @@ type NodeRemediationStatus struct { ActiveNodeOperation corev1.ObjectReference `json:"activeNodeOperation,omitempty"` // OperationsCount is num of NodeOperations executed by the NodeRemediation. Once the Node is remediated, this count will be reset to 0. OperationsCount int64 `json:"operationsCount"` + // NodeStatus represents whether Node should be remediated or not. + NodeStatus NodeStatus `json:"nodeStatus"` } //+kubebuilder:object:root=true @@ -81,3 +91,21 @@ type NodeRemediationList struct { func init() { SchemeBuilder.Register(&NodeRemediation{}, &NodeRemediationList{}) } + +func (r *NodeRemediation) CompareNodeCondition(conditions []corev1.NodeCondition) NodeStatus { +matchersLoop: + for _, matcher := range r.Spec.Rule.Conditions { + for _, cond := range conditions { + if cond.Type == matcher.Type { + switch cond.Status { + case matcher.Status: + continue matchersLoop + case corev1.ConditionUnknown: + return NodeStatusUnknown + } + } + } + return NodeStatusOK + } + return NodeStatusBad +} diff --git a/controllers/nodeoperation_controller.go b/controllers/nodeoperation_controller.go index 0b2610d..38d99e7 100644 --- a/controllers/nodeoperation_controller.go +++ b/controllers/nodeoperation_controller.go @@ -359,17 +359,32 @@ func (r *NodeOperationReconciler) reconcileRunning(ctx context.Context, nodeOp * nodeOp.Status.Phase = nodeopsv1alpha1.NodeOperationPhaseFailed } else { _, condition := isJobFinished(&job) + switch condition { - case "": // ongoing + case "": return ctrl.Result{}, nil case batchv1.JobFailed: + r.eventRecorder.Eventf(nodeOp, "Normal", "JobFinished", `Job "%s" in "%s" has failed`, job.Name, job.Namespace) nodeOp.Status.Reason = "Job has failed" nodeOp.Status.Phase = nodeopsv1alpha1.NodeOperationPhaseFailed case batchv1.JobComplete: + r.eventRecorder.Eventf(nodeOp, "Normal", "JobFinished", `Job "%s" in "%s" has completed`, job.Name, job.Namespace) + + if remediationName := nodeOp.NodeRemediationName(); remediationName != "" { + var remediation nodeopsv1alpha1.NodeRemediation + if err := r.Get(ctx, client.ObjectKey{Namespace: nodeOp.Namespace, Name: remediationName}, &remediation); err != nil { + return ctrl.Result{}, err + } + + if remediation.Status.NodeStatus != nodeopsv1alpha1.NodeStatusOK { + r.eventRecorder.Eventf(nodeOp, corev1.EventTypeNormal, "JobCompletedButNotRemediated", `Job "%s" in "%s" has completed but the Node is not remediated yet.`, job.Name, job.Namespace) + return ctrl.Result{}, nil + } + } + nodeOp.Status.Reason = "Job has completed" nodeOp.Status.Phase = nodeopsv1alpha1.NodeOperationPhaseCompleted } - r.eventRecorder.Eventf(nodeOp, "Normal", "JobFinished", `Job "%s" in "%s" has finished`, job.Name, job.Namespace) } // untaint the Node diff --git a/controllers/noderemediation_controller.go b/controllers/noderemediation_controller.go index 3540a3b..31eb3c6 100644 --- a/controllers/noderemediation_controller.go +++ b/controllers/noderemediation_controller.go @@ -72,6 +72,19 @@ func (r *NodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } + var node corev1.Node + if err := r.Get(ctx, types.NamespacedName{Name: remediation.Spec.NodeName}, &node); err != nil { + return ctrl.Result{}, err + } + + nodeStatus := remediation.CompareNodeCondition(node.Status.Conditions) + if nodeStatus != remediation.Status.NodeStatus { + remediation.Status.NodeStatus = nodeStatus + if err := r.Status().Update(ctx, &remediation); err != nil { + return ctrl.Result{}, err + } + } + var childOps nodeopsv1alpha1.NodeOperationList if err := r.List(ctx, &childOps, client.MatchingFields{operationRemediationOwnerKey: remediation.Name}); err != nil { return ctrl.Result{}, err @@ -108,12 +121,10 @@ func (r *NodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Requ } // Check node condition - var node corev1.Node - if err := r.Get(ctx, types.NamespacedName{Name: remediation.Spec.NodeName}, &node); err != nil { - return ctrl.Result{}, err - } - - if !doesMatchConditions(node.Status.Conditions, remediation.Spec.Rule.Conditions) { + switch remediation.Status.NodeStatus { + case nodeopsv1alpha1.NodeStatusUnknown: + return ctrl.Result{}, nil + case nodeopsv1alpha1.NodeStatusOK: // reset OperationsCount remediation.Status.OperationsCount = 0 if err := r.Status().Update(ctx, &remediation); err != nil { @@ -225,18 +236,3 @@ func (r *NodeRemediationReconciler) SetupWithManager(mgr ctrl.Manager) error { Watches(&source.Kind{Type: &corev1.Node{}}, handler.EnqueueRequestsFromMapFunc(nodeMapFn)). Complete(r) } - -func doesMatchConditions(conditions []corev1.NodeCondition, matchers []nodeopsv1alpha1.NodeConditionMatcher) bool { - for _, matcher := range matchers { - ok := false - for _, cond := range conditions { - if cond.Type == matcher.Type && cond.Status == matcher.Status { - ok = true - } - } - if !ok { - return false - } - } - return true -} From 1efdda867dc1091c25e897646184753a2d29daaa Mon Sep 17 00:00:00 2001 From: Ryota Arai Date: Thu, 15 Dec 2022 08:47:17 +0900 Subject: [PATCH 2/4] Eliminate operationCount of node remediations. --- api/v1alpha1/noderemediation_types.go | 2 -- ...nodeops.k8s.preferred.jp_noderemediations.yaml | 12 +++++------- controllers/noderemediation_controller.go | 15 +-------------- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/api/v1alpha1/noderemediation_types.go b/api/v1alpha1/noderemediation_types.go index 685f629..c84d5f2 100644 --- a/api/v1alpha1/noderemediation_types.go +++ b/api/v1alpha1/noderemediation_types.go @@ -60,8 +60,6 @@ type NodeRemediationStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file ActiveNodeOperation corev1.ObjectReference `json:"activeNodeOperation,omitempty"` - // OperationsCount is num of NodeOperations executed by the NodeRemediation. Once the Node is remediated, this count will be reset to 0. - OperationsCount int64 `json:"operationsCount"` // NodeStatus represents whether Node should be remediated or not. NodeStatus NodeStatus `json:"nodeStatus"` } diff --git a/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml b/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml index d62a3a0..2644893 100644 --- a/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml +++ b/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml @@ -103,14 +103,12 @@ spec: type: string type: object x-kubernetes-map-type: atomic - operationsCount: - description: OperationsCount is num of NodeOperations executed by - the NodeRemediation. Once the Node is remediated, this count will - be reset to 0. - format: int64 - type: integer + nodeStatus: + description: NodeStatus represents whether Node should be remediated + or not. + type: string required: - - operationsCount + - nodeStatus type: object type: object served: true diff --git a/controllers/noderemediation_controller.go b/controllers/noderemediation_controller.go index 31eb3c6..3eadd04 100644 --- a/controllers/noderemediation_controller.go +++ b/controllers/noderemediation_controller.go @@ -123,21 +123,9 @@ func (r *NodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Check node condition switch remediation.Status.NodeStatus { case nodeopsv1alpha1.NodeStatusUnknown: + r.eventRecorder.Eventf(&remediation, corev1.EventTypeNormal, "UnknownNodeStatus", "Because at least one Node condition is unknown status, remediation process is skipped") return ctrl.Result{}, nil case nodeopsv1alpha1.NodeStatusOK: - // reset OperationsCount - remediation.Status.OperationsCount = 0 - if err := r.Status().Update(ctx, &remediation); err != nil { - return ctrl.Result{}, err - } - - return ctrl.Result{}, nil - } - - // Avoid to create too many NodeOperations - if 0 < remediation.Status.OperationsCount { - // TODO: backoff feature. We can calculate the next backoff-ed trial timestamp from the counter value and the latest child NodeOperation completion timestamp - r.eventRecorder.Eventf(&remediation, corev1.EventTypeNormal, "NodeIsNotRemediated", `Though a NodeOperation has finished, the Node is not remediated. Skipping to create a NodeOperation.`) return ctrl.Result{}, nil } @@ -177,7 +165,6 @@ func (r *NodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } remediation.Status.ActiveNodeOperation = *ref - remediation.Status.OperationsCount++ if err := r.Status().Update(ctx, &remediation); err != nil { return ctrl.Result{}, err } From 62358c8ae15c7ee07175d6b22b81f1226c63bd5e Mon Sep 17 00:00:00 2001 From: Ryota Arai Date: Thu, 15 Dec 2022 08:47:49 +0900 Subject: [PATCH 3/4] Add a test for the remediation controller. --- controllers/suite_test.go | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/controllers/suite_test.go b/controllers/suite_test.go index ae96ce6..104f2ca 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -665,7 +665,7 @@ var _ = Describe("NodeRemediation", func() { node.Status.Conditions = append(node.Status.Conditions, corev1.NodeCondition{ Type: "TestRemediation", - Status: corev1.ConditionTrue, + Status: corev1.ConditionUnknown, Reason: "testing", Message: "testing", LastHeartbeatTime: metav1.NewTime(time.Now()), @@ -673,6 +673,25 @@ var _ = Describe("NodeRemediation", func() { }) Expect(k8sClient.Status().Update(ctx, &node)).NotTo(HaveOccurred()) + Eventually(func() bool { + events := &corev1.EventList{} + Expect(k8sClient.List(ctx, events)).NotTo(HaveOccurred()) + + for _, event := range events.Items { + if event.InvolvedObject.GroupVersionKind().String() == "nodeops.k8s.preferred.jp/v1alpha1, Kind=NodeRemediation" && + event.InvolvedObject.Name == remediation.Name && + event.Type == corev1.EventTypeNormal && + event.Reason == "UnknownNodeStatus" { + return true + } + } + return false + }, eventuallyTimeout).Should(BeTrue()) + + node.Status.Conditions[len(node.Status.Conditions)-1].Status = corev1.ConditionTrue + Expect(k8sClient.Status().Update(ctx, &node)).NotTo(HaveOccurred()) + + var nodeOp *nodeopsv1alpha1.NodeOperation Eventually(func() bool { nodeOpList := nodeopsv1alpha1.NodeOperationList{} Expect(k8sClient.List(ctx, &nodeOpList)).NotTo(HaveOccurred()) @@ -680,6 +699,7 @@ var _ = Describe("NodeRemediation", func() { for _, op := range nodeOpList.Items { for _, owner := range op.OwnerReferences { if owner.Kind == "NodeRemediation" && owner.Name == remediation.Name { + nodeOp = &op return true } } @@ -693,10 +713,12 @@ var _ = Describe("NodeRemediation", func() { Expect(k8sClient.List(ctx, events)).NotTo(HaveOccurred()) for _, event := range events.Items { - if event.InvolvedObject.GroupVersionKind().String() == "nodeops.k8s.preferred.jp/v1alpha1, Kind=NodeRemediation" && - event.InvolvedObject.Name == remediation.Name && + gvk := event.InvolvedObject.GroupVersionKind() + if gvk.Group == "nodeops.k8s.preferred.jp" && + gvk.Kind == "NodeOperation" && + event.InvolvedObject.Name == nodeOp.Name && event.Type == corev1.EventTypeNormal && - event.Reason == "NodeIsNotRemediated" { + event.Reason == "JobCompletedButNotRemediated" { return true } } From a90bfb12736e20e76230067561151dd17c89be3b Mon Sep 17 00:00:00 2001 From: Ryota Arai Date: Thu, 22 Dec 2022 11:29:03 +0900 Subject: [PATCH 4/4] Keep operationsCount status. --- api/v1alpha1/noderemediation_types.go | 2 ++ .../nodeops.k8s.preferred.jp_noderemediations.yaml | 7 +++++++ controllers/noderemediation_controller.go | 13 +++++++++++++ 3 files changed, 22 insertions(+) diff --git a/api/v1alpha1/noderemediation_types.go b/api/v1alpha1/noderemediation_types.go index c84d5f2..685f629 100644 --- a/api/v1alpha1/noderemediation_types.go +++ b/api/v1alpha1/noderemediation_types.go @@ -60,6 +60,8 @@ type NodeRemediationStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file ActiveNodeOperation corev1.ObjectReference `json:"activeNodeOperation,omitempty"` + // OperationsCount is num of NodeOperations executed by the NodeRemediation. Once the Node is remediated, this count will be reset to 0. + OperationsCount int64 `json:"operationsCount"` // NodeStatus represents whether Node should be remediated or not. NodeStatus NodeStatus `json:"nodeStatus"` } diff --git a/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml b/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml index 2644893..0dd34e8 100644 --- a/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml +++ b/config/crd/bases/nodeops.k8s.preferred.jp_noderemediations.yaml @@ -107,8 +107,15 @@ spec: description: NodeStatus represents whether Node should be remediated or not. type: string + operationsCount: + description: OperationsCount is num of NodeOperations executed by + the NodeRemediation. Once the Node is remediated, this count will + be reset to 0. + format: int64 + type: integer required: - nodeStatus + - operationsCount type: object type: object served: true diff --git a/controllers/noderemediation_controller.go b/controllers/noderemediation_controller.go index 3eadd04..15af282 100644 --- a/controllers/noderemediation_controller.go +++ b/controllers/noderemediation_controller.go @@ -126,6 +126,18 @@ func (r *NodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.eventRecorder.Eventf(&remediation, corev1.EventTypeNormal, "UnknownNodeStatus", "Because at least one Node condition is unknown status, remediation process is skipped") return ctrl.Result{}, nil case nodeopsv1alpha1.NodeStatusOK: + // reset OperationsCount + remediation.Status.OperationsCount = 0 + if err := r.Status().Update(ctx, &remediation); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + + // Avoid to create too many NodeOperations + if 0 < remediation.Status.OperationsCount { + // TODO: backoff feature. We can calculate the next backoff-ed trial timestamp from the counter value and the latest child NodeOperation completion timestamp + r.eventRecorder.Eventf(&remediation, corev1.EventTypeNormal, "NodeIsNotRemediated", `Though a NodeOperation has finished, the Node is not remediated. Skipping to create a NodeOperation.`) return ctrl.Result{}, nil } @@ -165,6 +177,7 @@ func (r *NodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } remediation.Status.ActiveNodeOperation = *ref + remediation.Status.OperationsCount++ if err := r.Status().Update(ctx, &remediation); err != nil { return ctrl.Result{}, err }