Skip to content

Commit

Permalink
Add Instance Alias and DetectInstanceAliasQuery.
Browse files Browse the repository at this point in the history
The Instance Alias is an optional label given to each instance
by the admin that isn't used by Orchestrator, except that it can pass it
to external tools through hooks.

The {successorAlias} placeholder for hooks gives the Instance Alias of
the successor, if any.
  • Loading branch information
enisoc authored and Shlomi Noach committed Apr 13, 2016
1 parent 13a17a8 commit 5b5581b
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 16 deletions.
1 change: 1 addition & 0 deletions conf/orchestrator-sample.conf.json
Expand Up @@ -65,6 +65,7 @@
"SlaveLagQuery": "",
"DetectClusterAliasQuery": "SELECT SUBSTRING_INDEX(@@hostname, '.', 1)",
"DetectClusterDomainQuery": "",
"DetectInstanceAliasQuery": "",
"DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
"PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
"PromotionIgnoreHostnameFilters": [],
Expand Down
10 changes: 6 additions & 4 deletions go/config/config.go
Expand Up @@ -100,6 +100,7 @@ type Configuration struct {
ClusterNameToAlias map[string]string // map between regex matching cluster name to a human friendly alias
DetectClusterAliasQuery string // Optional query (executed on topology instance) that returns the alias of a cluster. Query will only be executed on cluster master (though until the topology's master is resolved it may execute on other/all slaves). If provided, must return one row, one column
DetectClusterDomainQuery string // Optional query (executed on topology instance) that returns the VIP/CNAME/Alias/whatever domain name for the master of this cluster. Query will only be executed on cluster master (though until the topology's master is resolved it may execute on other/all slaves). If provided, must return one row, one column
DetectInstanceAliasQuery string // Optional query (executed on topology instance) that returns the alias of an instance. If provided, must return one row, one column
DataCenterPattern string // Regexp pattern with one group, extracting the datacenter name from the hostname
PhysicalEnvironmentPattern string // Regexp pattern with one group, extracting physical environment info from hostname (e.g. combination of datacenter & prod/dev env)
DetectDataCenterQuery string // Optional query (executed on topology instance) that returns the data center of an instance. If provided, must return one row, one column. Overrides DataCenterPattern and useful for installments where DC cannot be inferred by hostname
Expand Down Expand Up @@ -147,10 +148,10 @@ type Configuration struct {
RecoveryIgnoreHostnameFilters []string // Recovery analysis will completely ignore hosts matching given patterns
RecoverMasterClusterFilters []string // Only do master recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything)
RecoverIntermediateMasterClusterFilters []string // Only do IM recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything)
OnFailureDetectionProcesses []string // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {countSlaves}, {slaveHosts}, {isDowntimed}, {autoMasterRecovery}, {autoIntermediateMasterRecovery}
PreFailoverProcesses []string // Processes to execute before doing a failover (aborting operation should any once of them exits with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {countSlaves}, {slaveHosts}, {isDowntimed}
PostFailoverProcesses []string // Processes to execute after doing a failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {countSlaves}, {slaveHosts}, {isDowntimed}, {isSuccessful}, {lostSlaves}
PostUnsuccessfulFailoverProcesses []string // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {countSlaves}, {slaveHosts}, {isDowntimed}, {isSuccessful}, {lostSlaves}
OnFailureDetectionProcesses []string // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countSlaves}, {slaveHosts}, {isDowntimed}, {autoMasterRecovery}, {autoIntermediateMasterRecovery}
PreFailoverProcesses []string // Processes to execute before doing a failover (aborting operation should any one of them exit with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countSlaves}, {slaveHosts}, {isDowntimed}
PostFailoverProcesses []string // Processes to execute after doing a failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countSlaves}, {slaveHosts}, {isDowntimed}, {isSuccessful}, {lostSlaves}
PostUnsuccessfulFailoverProcesses []string // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {failureDescription}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countSlaves}, {slaveHosts}, {isDowntimed}, {isSuccessful}, {lostSlaves}
PostMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing an intermediate master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses
CoMasterRecoveryMustPromoteOtherCoMaster bool // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-master or else fail
Expand Down Expand Up @@ -238,6 +239,7 @@ func newConfiguration() *Configuration {
ClusterNameToAlias: make(map[string]string),
DetectClusterAliasQuery: "",
DetectClusterDomainQuery: "",
DetectInstanceAliasQuery: "",
DataCenterPattern: "",
PhysicalEnvironmentPattern: "",
DetectDataCenterQuery: "",
Expand Down
18 changes: 15 additions & 3 deletions go/db/db.go
Expand Up @@ -500,6 +500,8 @@ var generateSQLBase = []string{
`,
}

// generateSQLPatches contains DDLs for patching schema to the latest version.
// Add new statements at the end of the list so they form a changelog.
var generateSQLPatches = []string{
`
ALTER TABLE
Expand Down Expand Up @@ -799,15 +801,25 @@ var generateSQLPatches = []string{
ADD COLUMN has_replication_credentials TINYINT UNSIGNED NOT NULL
`,
`
ALTER TABLE
database_instance
ADD COLUMN allow_tls TINYINT UNSIGNED NOT NULL AFTER sql_delay
ALTER TABLE
database_instance
ADD COLUMN allow_tls TINYINT UNSIGNED NOT NULL AFTER sql_delay
`,
`
ALTER TABLE
database_instance
ADD COLUMN semi_sync_enforced TINYINT UNSIGNED NOT NULL AFTER physical_environment
`,
`
ALTER TABLE
database_instance
ADD COLUMN instance_alias varchar(128) CHARACTER SET ascii NOT NULL AFTER physical_environment
`,
`
ALTER TABLE
topology_recovery
ADD COLUMN successor_alias varchar(128) DEFAULT NULL
`,
}

// Track if a TLS has already been configured for topology
Expand Down
1 change: 1 addition & 0 deletions go/inst/instance.go
Expand Up @@ -42,6 +42,7 @@ const (
// It presents important replication configuration and detailed replication status.
type Instance struct {
Key InstanceKey
InstanceAlias string
Uptime uint
ServerID uint
ServerUUID string
Expand Down
15 changes: 12 additions & 3 deletions go/inst/instance_dao.go
Expand Up @@ -441,6 +441,11 @@ func ReadTopologyInstance(instanceKey *InstanceKey) (*Instance, error) {
logReadTopologyInstanceError(instanceKey, "DetectPhysicalEnvironmentQuery", err)
}

if config.Config.DetectInstanceAliasQuery != "" && !isMaxScale {
err := db.QueryRow(config.Config.DetectInstanceAliasQuery).Scan(&instance.InstanceAlias)
logReadTopologyInstanceError(instanceKey, "DetectInstanceAliasQuery", err)
}

if config.Config.DetectSemiSyncEnforcedQuery != "" && !isMaxScale {
err := db.QueryRow(config.Config.DetectSemiSyncEnforcedQuery).Scan(&instance.SemiSyncEnforced)
logReadTopologyInstanceError(instanceKey, "DetectSemiSyncEnforcedQuery", err)
Expand Down Expand Up @@ -651,6 +656,7 @@ func readInstanceRow(m sqlutils.RowMap) *Instance {
instance.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp")
instance.UnresolvedHostname = m.GetString("unresolved_hostname")
instance.AllowTLS = m.GetBool("allow_tls")
instance.InstanceAlias = m.GetString("instance_alias")

instance.SlaveHosts.ReadJson(slaveHostsJSON)
return instance
Expand Down Expand Up @@ -1627,7 +1633,8 @@ func writeInstance(instance *Instance, instanceWasActuallyFound bool, lastError
replication_credentials_available=VALUES(replication_credentials_available),
has_replication_credentials=VALUES(has_replication_credentials),
allow_tls=VALUES(allow_tls),
semi_sync_enforced=VALUES(semi_sync_enforced)
semi_sync_enforced=VALUES(semi_sync_enforced),
instance_alias=VALUES(instance_alias)
`
} else {
// Scenario: some slave reported a master of his; but the master cannot be contacted.
Expand Down Expand Up @@ -1685,8 +1692,9 @@ func writeInstance(instance *Instance, instanceWasActuallyFound bool, lastError
replication_credentials_available,
has_replication_credentials,
allow_tls,
semi_sync_enforced
) values (?, ?, NOW(), NOW(), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
semi_sync_enforced,
instance_alias
) values (?, ?, NOW(), NOW(), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
%s
`, insertIgnore, onDuplicateKeyUpdate)

Expand Down Expand Up @@ -1738,6 +1746,7 @@ func writeInstance(instance *Instance, instanceWasActuallyFound bool, lastError
instance.HasReplicationCredentials,
instance.AllowTLS,
instance.SemiSyncEnforced,
instance.InstanceAlias,
)
if err != nil {
return log.Errore(err)
Expand Down
15 changes: 11 additions & 4 deletions go/logic/topology_recovery.go
Expand Up @@ -18,6 +18,10 @@ package logic

import (
"fmt"
"sort"
"strings"
"time"

"github.com/outbrain/golib/log"
"github.com/outbrain/orchestrator/go/attributes"
"github.com/outbrain/orchestrator/go/config"
Expand All @@ -26,9 +30,6 @@ import (
"github.com/outbrain/orchestrator/go/process"
"github.com/pmylund/go-cache"
"github.com/rcrowley/go-metrics"
"sort"
"strings"
"time"
)

// BlockedTopologyRecovery represents an entry in the blocked_topology_recovery table
Expand All @@ -47,6 +48,7 @@ type TopologyRecovery struct {
Id int64
AnalysisEntry inst.ReplicationAnalysis
SuccessorKey *inst.InstanceKey
SuccessorAlias string
IsActive bool
IsSuccessful bool
LostSlaves inst.InstanceKeyMap
Expand Down Expand Up @@ -136,7 +138,7 @@ func init() {
metrics.Register("recover.dead_co_master.fail", recoverDeadCoMasterFailureCounter)
}

// replaceCommandPlaceholders replaxces agreed-upon placeholders with analysis data
// replaceCommandPlaceholders replaces agreed-upon placeholders with analysis data
func replaceCommandPlaceholders(command string, topologyRecovery *TopologyRecovery) string {
analysisEntry := &topologyRecovery.AnalysisEntry
command = strings.Replace(command, "{failureType}", string(analysisEntry.Analysis), -1)
Expand All @@ -156,6 +158,9 @@ func replaceCommandPlaceholders(command string, topologyRecovery *TopologyRecove
if topologyRecovery.SuccessorKey != nil {
command = strings.Replace(command, "{successorHost}", topologyRecovery.SuccessorKey.Hostname, -1)
command = strings.Replace(command, "{successorPort}", fmt.Sprintf("%d", topologyRecovery.SuccessorKey.Port), -1)
// As long as SuccessorKey != nil, we replace {successorAlias}.
// If SuccessorAlias is "", it's fine. We'll replace {successorAlias} with "".
command = strings.Replace(command, "{successorAlias}", topologyRecovery.SuccessorAlias, -1)
}

command = strings.Replace(command, "{lostSlaves}", topologyRecovery.LostSlaves.ToCommaDelimitedList(), -1)
Expand Down Expand Up @@ -728,6 +733,7 @@ func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysi
if !skipProcesses {
// Execute post intermediate-master-failover processes
topologyRecovery.SuccessorKey = &promotedSlave.Key
topologyRecovery.SuccessorAlias = promotedSlave.InstanceAlias
executeProcesses(config.Config.PostIntermediateMasterFailoverProcesses, "PostIntermediateMasterFailoverProcesses", topologyRecovery, false)
}
} else {
Expand Down Expand Up @@ -888,6 +894,7 @@ func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candida
if !skipProcesses {
// Execute post master-failover processes
topologyRecovery.SuccessorKey = &promotedSlave.Key
topologyRecovery.SuccessorAlias = promotedSlave.InstanceAlias
executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", topologyRecovery, false)
}
} else {
Expand Down
10 changes: 8 additions & 2 deletions go/logic/topology_recovery_dao.go
Expand Up @@ -18,13 +18,14 @@ package logic

import (
"fmt"
"strings"

"github.com/outbrain/golib/log"
"github.com/outbrain/golib/sqlutils"
"github.com/outbrain/orchestrator/go/config"
"github.com/outbrain/orchestrator/go/db"
"github.com/outbrain/orchestrator/go/inst"
"github.com/outbrain/orchestrator/go/process"
"strings"
)

// AttemptFailureDetectionRegistration tries to add a failure-detection entry; if this fails that means the problem has already been detected
Expand Down Expand Up @@ -360,8 +361,10 @@ func ResolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst

isSuccessful := false
var successorKeyToWrite inst.InstanceKey
var successorAliasToWrite string
if successorInstance != nil {
topologyRecovery.SuccessorKey = &successorInstance.Key
topologyRecovery.SuccessorAlias = successorInstance.InstanceAlias
isSuccessful = true
successorKeyToWrite = successorInstance.Key
}
Expand All @@ -370,6 +373,7 @@ func ResolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst
is_successful = ?,
successor_hostname = ?,
successor_port = ?,
successor_alias = ?,
lost_slaves = ?,
participating_instances = ?,
all_errors = ?,
Expand All @@ -380,7 +384,7 @@ func ResolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst
AND processing_node_hostname = ?
AND processcing_node_token = ?
`, isSuccessful, successorKeyToWrite.Hostname, successorKeyToWrite.Port,
topologyRecovery.LostSlaves.ToCommaDelimitedList(),
successorAliasToWrite, topologyRecovery.LostSlaves.ToCommaDelimitedList(),
topologyRecovery.ParticipatingInstanceKeys.ToCommaDelimitedList(),
strings.Join(topologyRecovery.AllErrors, "\n"),
topologyRecovery.Id, process.ThisHostname, process.ProcessToken.Hash,
Expand All @@ -405,6 +409,7 @@ func readRecoveries(whereCondition string, limit string, args []interface{}) ([]
processcing_node_token,
ifnull(successor_hostname, '') as successor_hostname,
ifnull(successor_port, 0) as successor_port,
ifnull(successor_alias, '') as successor_alias,
analysis,
cluster_name,
cluster_alias,
Expand Down Expand Up @@ -447,6 +452,7 @@ func readRecoveries(whereCondition string, limit string, args []interface{}) ([]
topologyRecovery.SuccessorKey = &inst.InstanceKey{}
topologyRecovery.SuccessorKey.Hostname = m.GetString("successor_hostname")
topologyRecovery.SuccessorKey.Port = m.GetInt("successor_port")
topologyRecovery.SuccessorAlias = m.GetString("successor_alias")

topologyRecovery.AnalysisEntry.ClusterDetails.ReadRecoveryInfo()

Expand Down
3 changes: 3 additions & 0 deletions resources/public/js/orchestrator.js
Expand Up @@ -248,6 +248,9 @@ function openNodeModal(node) {

$('#modalDataAttributesTable').html("");

if (node.InstanceAlias) {
addNodeModalDataAttribute("Instance Alias", node.InstanceAlias);
}
addNodeModalDataAttribute("Last seen", node.LastSeenTimestamp + " (" + node.SecondsSinceLastSeen.Int64 + "s ago)");
if (node.UnresolvedHostname) {
addNodeModalDataAttribute("Unresolved hostname", node.UnresolvedHostname);
Expand Down

0 comments on commit 5b5581b

Please sign in to comment.