Skip to content

Commit

Permalink
controller: do not delete failed replica if there is no healthy replica
Browse files Browse the repository at this point in the history
Longhorn 7357

Signed-off-by: Derek Su <derek.su@suse.com>
  • Loading branch information
derekbit authored and David Ko committed Dec 24, 2023
1 parent f53d7cc commit 46af771
Showing 1 changed file with 21 additions and 11 deletions.
32 changes: 21 additions & 11 deletions controller/volume_controller.go
Expand Up @@ -944,27 +944,30 @@ func (c *VolumeController) cleanupCorruptedOrStaleReplicas(v *longhorn.Volume, r
continue
}

if datastore.IsBackendStoreDriverV1(v.Spec.BackendStoreDriver) {
staled := false
if v.Spec.StaleReplicaTimeout > 0 && util.TimestampAfterTimeout(r.Spec.FailedAt,
time.Duration(int64(v.Spec.StaleReplicaTimeout*60))*time.Second) {
staled := false
if v.Spec.StaleReplicaTimeout > 0 &&
util.TimestampAfterTimeout(r.Spec.FailedAt, time.Duration(int64(v.Spec.StaleReplicaTimeout*60))*time.Second) {

staled = true
}
staled = true
}

if datastore.IsBackendStoreDriverV1(v.Spec.BackendStoreDriver) {
// 1. failed multiple times or failed at rebuilding (`Spec.RebuildRetryCount` of a newly created rebuilding replica
// is `FailedReplicaMaxRetryCount`) before ever became healthy/ mode RW,
// 2. failed too long ago, became stale and unnecessary to keep around, unless we don't have any healthy replicas
// 3. failed for race condition at upgrading when waiting for IM-r to start and it would never become healthy
if (r.Spec.RebuildRetryCount >= scheduler.FailedReplicaMaxRetryCount) || (healthyCount != 0 && staled) || (r.Spec.Image != v.Status.CurrentImage) {
log.WithField("replica", r.Name).Info("Cleaning up corrupted, staled replica")
if err := c.deleteReplica(r, rs); err != nil {
return errors.Wrapf(err, "cannot cleanup staled replica %v", r.Name)
return errors.Wrapf(err, "cannot clean up staled replica %v", r.Name)
}
}
} else {
if err := c.deleteReplica(r, rs); err != nil {
return errors.Wrapf(err, "failed to cleanup staled replica %v", r.Name)
// TODO: check `staled` flag after v2 volume supports online replica rebuilding
if healthyCount != 0 {
if err := c.deleteReplica(r, rs); err != nil {
return errors.Wrapf(err, "failed to clean up staled replica %v", r.Name)
}
}
}
}
Expand Down Expand Up @@ -1264,11 +1267,16 @@ func (c *VolumeController) ReconcileVolumeState(v *longhorn.Volume, es map[strin
// At this moment, Longhorn goes into the IF statement below this IF statement and salvage all replicas.
if autoSalvage && !v.Status.IsStandby && !v.Status.RestoreRequired {
// Since all replica failed and autoSalvage is enable, mark engine controller salvage requested
e.Spec.SalvageRequested = true
log.Infof("All replicas are failed, set engine salvageRequested to %v", e.Spec.SalvageRequested)
// TODO: SalvageRequested is meaningless for v2 volume
if datastore.IsBackendStoreDriverV1(v.Spec.BackendStoreDriver) {
e.Spec.SalvageRequested = true
log.Infof("All replicas are failed, set engine salvageRequested to %v", e.Spec.SalvageRequested)
}
}
// make sure the volume is detached before automatically salvage replicas
if autoSalvage && v.Status.State == longhorn.VolumeStateDetached && !v.Status.IsStandby && !v.Status.RestoreRequired {
log.Info("All replicas are failed, auto-salvaging volume")

lastFailedAt := time.Time{}
failedUsableReplicas := map[string]*longhorn.Replica{}
dataExists := false
Expand Down Expand Up @@ -1317,6 +1325,8 @@ func (c *VolumeController) ReconcileVolumeState(v *longhorn.Volume, es map[strin
if !dataExists {
log.Warn("Failed to auto salvage volume: no data exists")
} else {
log.Info("Bringing up replicas for auto-salvage")

// This salvage is for revision counter enabled case
salvaged := false
// Bring up the replicas for auto-salvage
Expand Down

0 comments on commit 46af771

Please sign in to comment.