Skip to content

Commit

Permalink
Adding failover flag
Browse files Browse the repository at this point in the history
  • Loading branch information
mszacillo committed Aug 14, 2024
1 parent 7413fca commit 5bd1cef
Show file tree
Hide file tree
Showing 13 changed files with 347 additions and 7 deletions.
26 changes: 26 additions & 0 deletions api/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -19896,6 +19896,24 @@
}
]
},
"com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.FailoverHistoryItem": {
"description": "FailoverHistoryItem represents information about how the resource has failed over from one cluster to another.",
"type": "object",
"properties": {
"failoverTime": {
"description": "FailoverTime represents the timestamp when the workload failed over. It is represented in RFC3339 form(like '2021-04-25T10:02:10Z') and is in UTC.",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
},
"originCluster": {
"description": "OriginCluster denotes the name of the cluster from which the workload was failed over.",
"type": "string"
},
"reason": {
"description": "Reason denotes the reason why the workload failed over.",
"type": "string"
}
}
},
"com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.GracefulEvictionTask": {
"description": "GracefulEvictionTask represents a graceful eviction task.",
"type": "object",
Expand Down Expand Up @@ -20202,6 +20220,14 @@
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Condition"
}
},
"failoverHistory": {
"description": "FailoverHistory represents history of the previous failovers of this resource",
"type": "array",
"items": {
"default": {},
"$ref": "#/definitions/com.github.karmada-io.karmada.pkg.apis.work.v1alpha2.FailoverHistoryItem"
}
},
"lastScheduledTime": {
"description": "LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling. It is represented in RFC3339 form (like '2006-01-02T15:04:05Z') and is in UTC.",
"$ref": "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,29 @@ spec:
- type
type: object
type: array
failoverHistory:
description: FailoverHistory represents history of the previous failovers
of this resource
items:
description: FailoverHistoryItem represents information about how
the resource has failed over from one cluster to another.
properties:
failoverTime:
description: |-
FailoverTime represents the timestamp when the workload failed over.
It is represented in RFC3339 form(like '2021-04-25T10:02:10Z') and is in UTC.
format: date-time
type: string
originCluster:
description: OriginCluster denotes the name of the cluster from
which the workload was failed over.
type: string
reason:
description: Reason denotes the reason why the workload failed
over.
type: string
type: object
type: array
lastScheduledTime:
description: |-
LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,29 @@ spec:
- type
type: object
type: array
failoverHistory:
description: FailoverHistory represents history of the previous failovers
of this resource
items:
description: FailoverHistoryItem represents information about how
the resource has failed over from one cluster to another.
properties:
failoverTime:
description: |-
FailoverTime represents the timestamp when the workload failed over.
It is represented in RFC3339 form(like '2021-04-25T10:02:10Z') and is in UTC.
format: date-time
type: string
originCluster:
description: OriginCluster denotes the name of the cluster from
which the workload was failed over.
type: string
reason:
description: Reason denotes the reason why the workload failed
over.
type: string
type: object
type: array
lastScheduledTime:
description: |-
LastScheduledTime representing the latest timestamp when scheduler successfully finished a scheduling.
Expand Down
17 changes: 17 additions & 0 deletions pkg/apis/work/v1alpha2/binding_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,23 @@ type ResourceBindingStatus struct {
// AggregatedStatus represents status list of the resource running in each member cluster.
// +optional
AggregatedStatus []AggregatedStatusItem `json:"aggregatedStatus,omitempty"`

// FailoverHistory represents history of the previous failovers of this resource
// +optional
FailoverHistory []FailoverHistoryItem `json:"failoverHistory,omitempty"`
}

// FailoverHistoryItem represents information about how the resource has failed over from one cluster to another.
type FailoverHistoryItem struct {
// FailoverTime represents the timestamp when the workload failed over.
// It is represented in RFC3339 form(like '2021-04-25T10:02:10Z') and is in UTC.
// +optional
FailoverTime *metav1.Time `json:"failoverTime,omitempty"`

// OriginCluster denotes the name of the cluster from which the workload was failed over.
// +optional
OriginCluster string `json:"originCluster,omitempty"`

// Reason denotes the reason why the workload failed over.
// +optional
Reason string `json:"reason,omitempty"`
}

// AggregatedStatusItem represents status of the resource running in a member cluster.
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/work/v1alpha2/well_known_constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,8 @@ const (
// EvictionProducerTaintManager represents the name of taint manager.
EvictionProducerTaintManager = "TaintManager"
)

// Define failover labels.
const (
// ResourceBindingFailoverLabel is the label key used to mark a workload with the
// type of failover it went through. The binding controller sets its value to
// "application" or "cluster".
ResourceBindingFailoverLabel = "resourcebinding.karmada.io/failover-type"
)
27 changes: 27 additions & 0 deletions pkg/apis/work/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
configv1alpha1 "github.com/karmada-io/karmada/pkg/apis/config/v1alpha1"
policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
controllerUtils "github.com/karmada-io/karmada/pkg/controllers/utils"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/resourceinterpreter"
"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
Expand Down Expand Up @@ -154,6 +155,11 @@ func (c *RBApplicationFailoverController) syncBinding(binding *workv1alpha2.Reso

func (c *RBApplicationFailoverController) evictBinding(binding *workv1alpha2.ResourceBinding, clusters []string) error {
for _, cluster := range clusters {
klog.V(4).Infof("Updating resource binding with latest failover timestamp for cluster %s.", cluster)

if err := controllerUtils.UpdateFailoverStatus(c.Client, binding, cluster, workv1alpha2.EvictionReasonApplicationFailure); err != nil {
klog.Errorf("Failed to update status with failover information.")
}
switch binding.Spec.Failover.Application.PurgeMode {
case policyv1alpha1.Graciously:
if features.FeatureGate.Enabled(features.GracefulEviction) {
Expand Down
34 changes: 34 additions & 0 deletions pkg/controllers/binding/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,43 @@ func mergeTargetClusters(targetClusters []workv1alpha2.TargetCluster, requiredBy
return targetClusters
}

// checkFailoverCondition looks at the most recent entry in the binding's
// failover history and maps its reason to the corresponding eviction reason.
//
// It returns workv1alpha2.EvictionReasonTaintUntolerated for a
// "ClusterFailover" entry, workv1alpha2.EvictionReasonApplicationFailure for an
// "ApplicationFailover" entry, and an empty string when the history is empty
// or the latest reason is neither of those.
func checkFailoverCondition(resourceBinding *workv1alpha2.ResourceBinding) string {
	history := resourceBinding.Status.FailoverHistory
	klog.V(4).Infof("Failover History is %+v", history)
	if len(history) == 0 {
		return ""
	}

	// Only the newest entry (the last one appended) is considered.
	latest := history[len(history)-1]
	klog.V(4).Infof("Latest failover item is %+v", latest)

	switch latest.Reason {
	case "ClusterFailover":
		klog.V(4).Info("The latest failover was due to a ClusterFailover.")
		return workv1alpha2.EvictionReasonTaintUntolerated
	case "ApplicationFailover":
		klog.V(4).Info("The latest failover was due to a ApplicationFailover.")
		return workv1alpha2.EvictionReasonApplicationFailure
	default:
		return ""
	}
}

func mergeLabel(workload *unstructured.Unstructured, binding metav1.Object, scope apiextensionsv1.ResourceScope) map[string]string {
var workLabel = make(map[string]string)
if scope == apiextensionsv1.NamespaceScoped {
klog.V(4).Info("Checking for failover condition.")
namespaceBindingObj := binding.(*workv1alpha2.ResourceBinding)
failoverReason := checkFailoverCondition(namespaceBindingObj)
if failoverReason != "" {
if failoverReason == workv1alpha2.EvictionReasonApplicationFailure {
klog.V(4).Info("Appending application failover label")
util.MergeLabel(workload, workv1alpha2.ResourceBindingFailoverLabel, "application")
workLabel[workv1alpha2.ResourceBindingFailoverLabel] = "application"
} else if failoverReason == workv1alpha2.EvictionReasonTaintUntolerated {
klog.V(4).Info("Appending cluster failover label")
util.MergeLabel(workload, workv1alpha2.ResourceBindingFailoverLabel, "cluster")
workLabel[workv1alpha2.ResourceBindingFailoverLabel] = "cluster"
}
}
bindingID := util.GetLabelValue(binding.GetLabels(), workv1alpha2.ResourceBindingPermanentIDLabel)
util.MergeLabel(workload, workv1alpha2.ResourceBindingPermanentIDLabel, bindingID)
workLabel[workv1alpha2.ResourceBindingPermanentIDLabel] = bindingID
Expand Down
6 changes: 6 additions & 0 deletions pkg/controllers/cluster/taint_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
controllerUtils "github.com/karmada-io/karmada/pkg/controllers/utils"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/util"
"github.com/karmada-io/karmada/pkg/util/fedinformer/keys"
Expand Down Expand Up @@ -170,6 +171,11 @@ func (tc *NoExecuteTaintManager) syncBindingEviction(key util.QueueKey) error {
// Case 2: Need eviction after toleration time. If time is up, do eviction right now.
// Case 3: Tolerate forever, we do nothing.
if needEviction || tolerationTime == 0 {
klog.V(4).Infof("Updating resource binding: %s with latest failover information %s.", binding.Name, cluster)
updateErr := controllerUtils.UpdateFailoverStatus(tc.Client, binding, cluster, workv1alpha2.EvictionReasonTaintUntolerated)
if updateErr != nil {
klog.Errorf("Failed to update status with failover information")
}
// update final result to evict the target cluster
if features.FeatureGate.Enabled(features.GracefulEviction) {
binding.Spec.GracefulEvictCluster(cluster, workv1alpha2.NewTaskOptions(workv1alpha2.WithProducer(workv1alpha2.EvictionProducerTaintManager), workv1alpha2.WithReason(workv1alpha2.EvictionReasonTaintUntolerated)))
Expand Down
114 changes: 114 additions & 0 deletions pkg/controllers/utils/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
Copyright 2020 The Karmada Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package utils

import (
"context"
"fmt"

"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/karmada-io/karmada/pkg/util/helper"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
)

// restrictFailoverHistoryInfo reports whether failover history must NOT be
// recorded for the given binding.
//
// It returns true when:
//   - the binding has no placement or no replica scheduling strategy
//     (treated like Duplicated scheduling),
//   - replica scheduling is Duplicated,
//   - replica scheduling is Divided and there are no spread constraints,
//   - replica scheduling is Divided and any spread constraint spreads by
//     label, or spreads by the "cluster" field with MaxGroups or MinGroups
//     greater than one.
//
// In every other case it returns false.
func restrictFailoverHistoryInfo(binding *workv1alpha2.ResourceBinding) bool {
	placement := binding.Spec.Placement
	// Placement and ReplicaScheduling are optional pointer fields; guard them
	// so a binding without an explicit strategy cannot cause a nil dereference.
	if placement == nil || placement.ReplicaScheduling == nil {
		return true
	}

	switch placement.ReplicaScheduling.ReplicaSchedulingType {
	case policyv1alpha1.ReplicaSchedulingTypeDuplicated:
		return true
	case policyv1alpha1.ReplicaSchedulingTypeDivided:
		if len(placement.SpreadConstraints) == 0 {
			return true
		}
		for _, constraint := range placement.SpreadConstraints {
			if constraint.SpreadByLabel != "" {
				return true
			}
			if constraint.SpreadByField == "cluster" && (constraint.MaxGroups > 1 || constraint.MinGroups > 1) {
				return true
			}
		}
	}

	return false
}

func UpdateFailoverStatus(client client.Client, binding *workv1alpha2.ResourceBinding, cluster string, failoverType string) (err error) {

Check failure on line 59 in pkg/controllers/utils/common.go

View workflow job for this annotation

GitHub Actions / lint

exported: exported function UpdateFailoverStatus should have comment or be unexported (revive)
if restrictFailoverHistoryInfo(binding) {
return nil
}
message := fmt.Sprintf("Failover triggered for replica on cluster %s", cluster)

var reason string
if failoverType == workv1alpha2.EvictionReasonApplicationFailure {
reason = "ApplicationFailover"
} else if failoverType == workv1alpha2.EvictionReasonTaintUntolerated {
reason = "ClusterFailover"
} else {
errMsg := "Invalid failover type passed into updateFailoverStatus"
klog.Errorf(errMsg)
return fmt.Errorf(errMsg)
}

newFailoverAppliedCondition := metav1.Condition{
Type: failoverType,
Status: metav1.ConditionTrue,
Reason: reason,
Message: message,
LastTransitionTime: metav1.Now(),
}

err = retry.RetryOnConflict(retry.DefaultRetry, func() (err error) {
_, err = helper.UpdateStatus(context.Background(), client, binding, func() error {
// set binding status with the newest condition
currentTime := metav1.Now()
failoverHistoryItem := workv1alpha2.FailoverHistoryItem{
FailoverTime: &currentTime,
OriginCluster: cluster,
Reason: reason,
}
binding.Status.FailoverHistory = append(binding.Status.FailoverHistory, failoverHistoryItem)
klog.V(4).Infof("Failover history is %+v", binding.Status.FailoverHistory)
existingCondition := meta.FindStatusCondition(binding.Status.Conditions, failoverType)
if existingCondition != nil && newFailoverAppliedCondition.Message == existingCondition.Message { //check
// SetStatusCondition only updates if new status differs from the old status
// Update the time here as the status will not change if multiple failovers of the same failoverType occur
existingCondition.LastTransitionTime = metav1.Now()
} else {
meta.SetStatusCondition(&binding.Status.Conditions, newFailoverAppliedCondition)
}
klog.V(4).Infof("Removing cluster %s from binding. Remaining clusters are %+v", cluster, binding.Spec.Clusters)
return nil
})
return err
})

if err != nil {
klog.Errorf("Failed to update condition of binding %s/%s: %s", binding.Namespace, binding.Name, err.Error())
return err
}
return nil
}
Loading

0 comments on commit 5bd1cef

Please sign in to comment.