Skip to content

Commit

Permalink
Override master promotion: apply post-unsuccessful processes if overr…
Browse files Browse the repository at this point in the history
…iden
  • Loading branch information
Shlomi Noach committed Aug 4, 2019
1 parent c72a6c8 commit 724bec0
Showing 1 changed file with 25 additions and 16 deletions.
41 changes: 25 additions & 16 deletions go/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -815,27 +815,36 @@ func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidate
promotedReplica, lostReplicas, err := recoverDeadMaster(topologyRecovery, candidateInstanceKey, skipProcesses)
topologyRecovery.LostReplicas.AddInstances(lostReplicas)

// And this is the end; whether successful or not, we're done.
resolveRecovery(topologyRecovery, promotedReplica)
if promotedReplica != nil {
if config.Config.FailMasterPromotionIfSQLThreadNotUpToDate && !promotedReplica.SQLThreadUpToDate() {
message := fmt.Sprintf("RecoverDeadMaster: failed promotion. FailMasterPromotionIfSQLThreadNotUpToDate is set and promoted replica %+v 's sql thread is not up to date (relay logs still unapplied). Aborting promotion", promotedReplica.Key)
AuditTopologyRecovery(topologyRecovery, message)
return false, nil, log.Error(message)
overrideMasterPromotion := func() (*inst.Instance, error) {
if promotedReplica == nil {
// No promotion; nothing to override.
return promotedReplica, err
}
// Scenarios where we might cancel the promotion.
if satisfied, reason := MasterFailoverGeographicConstraintSatisfied(&analysisEntry, promotedReplica); !satisfied {
message := fmt.Sprintf("RecoverDeadMaster: failed %+v promotion; %s", promotedReplica.Key, reason)
AuditTopologyRecovery(topologyRecovery, message)
return false, nil, log.Error(message)
return nil, fmt.Errorf("RecoverDeadMaster: failed %+v promotion; %s", promotedReplica.Key, reason)
}
if config.Config.DelayMasterPromotionIfSQLThreadNotUpToDate {
AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Waiting to ensure the SQL thread catches up on %+v", promotedReplica.Key))
if _, err = inst.WaitForSQLThreadUpToDate(&promotedReplica.Key, 0, 0); err != nil {
return false, nil, err
if config.Config.FailMasterPromotionIfSQLThreadNotUpToDate && !promotedReplica.SQLThreadUpToDate() {
return nil, fmt.Errorf("RecoverDeadMaster: failed promotion. FailMasterPromotionIfSQLThreadNotUpToDate is set and promoted replica %+v 's sql thread is not up to date (relay logs still unapplied). Aborting promotion", promotedReplica.Key)
}
if config.Config.DelayMasterPromotionIfSQLThreadNotUpToDate && !promotedReplica.SQLThreadUpToDate() {
AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("DelayMasterPromotionIfSQLThreadNotUpToDate: waiting for SQL thread on %+v", promotedReplica.Key))
if _, err := inst.WaitForSQLThreadUpToDate(&promotedReplica.Key, 0, 0); err != nil {
return nil, fmt.Errorf("DelayMasterPromotionIfSQLThreadNotUpToDate error: %+v", err)
}
AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("SQL thread caught up on %+v", promotedReplica.Key))
AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("DelayMasterPromotionIfSQLThreadNotUpToDate: SQL thread caught up on %+v", promotedReplica.Key))
}

// All seems well. No override done.
return promotedReplica, err
}
promotedReplica, err = overrideMasterPromotion()
if err != nil {
AuditTopologyRecovery(topologyRecovery, err.Error())
}
// And this is the end; whether successful or not, we're done.
resolveRecovery(topologyRecovery, promotedReplica)
// Now, see whether we are successful or not. From this point there's no going back.
if promotedReplica != nil {
// Success!
recoverDeadMasterSuccessCounter.Inc(1)
AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: successfully promoted %+v", promotedReplica.Key))
Expand Down

0 comments on commit 724bec0

Please sign in to comment.