Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server side problem analysis #793

Merged
merged 9 commits into from
Feb 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions go/inst/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,16 @@ type Instance struct {
UnresolvedHostname string
AllowTLS bool

Problems []string

LastDiscoveryLatency time.Duration
}

// NewInstance allocates an Instance and returns a pointer to it, with the
// SlaveHosts map and the Problems slice both initialized and empty.
func NewInstance() *Instance {
	instance := new(Instance)
	instance.SlaveHosts = make(map[InstanceKey]bool)
	// Deliberately an empty, non-nil slice: encoding/json writes a nil slice
	// as null but an empty one as [].
	// NOTE(review): presumably relied upon by JSON consumers of Problems — confirm.
	instance.Problems = []string{}
	return instance
}

Expand Down
21 changes: 18 additions & 3 deletions go/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,21 @@ func readInstanceRow(m sqlutils.RowMap) *Instance {

instance.SlaveHosts.ReadJson(slaveHostsJSON)
instance.applyFlavorName()

// problems
if !instance.IsLastCheckValid {
instance.Problems = append(instance.Problems, "last_check_invalid")
} else if !instance.IsRecentlyChecked {
instance.Problems = append(instance.Problems, "not_recently_checked")
} else if instance.ReplicationThreadsExist() && !instance.ReplicaRunning() {
instance.Problems = append(instance.Problems, "not_replicating")
} else if instance.SlaveLagSeconds.Valid && instance.SlaveLagSeconds.Int64 > int64(config.Config.ReasonableReplicationLagSeconds) {
instance.Problems = append(instance.Problems, "replication_lag")
}
if instance.GtidErrant != "" {
instance.Problems = append(instance.Problems, "errant_gtid")
}

return instance
}

Expand Down Expand Up @@ -1267,15 +1282,15 @@ func ReadProblemInstances(clusterName string) ([](*Instance), error) {
and (
(last_seen < last_checked)
or (unix_timestamp() - unix_timestamp(last_checked) > ?)
or (replication_sql_thread_state != 1)
or (replication_io_thread_state != 1)
or (replication_sql_thread_state not in (-1 ,1))
or (replication_io_thread_state not in (-1 ,1))
or (abs(cast(seconds_behind_master as signed) - cast(sql_delay as signed)) > ?)
or (abs(cast(slave_lag_seconds as signed) - cast(sql_delay as signed)) > ?)
or (gtid_errant != '')
)
`

args := sqlutils.Args(clusterName, clusterName, config.Config.InstancePollSeconds, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds)
args := sqlutils.Args(clusterName, clusterName, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds)
instances, err := readInstancesByCondition(condition, args, "")
if err != nil {
return instances, err
Expand Down
12 changes: 12 additions & 0 deletions resources/public/css/orchestrator.css
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,15 @@ body {
color: #ffffff;
}

/* Instance header carrying the "label-errant" class: grey background with white text. */
.instance h3.label-errant {
background-color: #b0b0b0;
color: #ffffff;
}

/* Glyph icons inside a "label-errant" header are white, matching the header text. */
.instance h3.label-errant .glyphicon {
color: #ffffff;
}

.instance h3.label-fatal {
background-color: #000000;
color: #ffffff;
Expand Down Expand Up @@ -233,6 +242,9 @@ body {
background-color: #000000;
}

/* Badge variant of "label-errant": uses the same grey as the h3.label-errant header. */
.instance .badge.label-errant {
background-color: #b0b0b0;
}

.instance .badge.label-primary {
background-color: #428BCA;
Expand Down
27 changes: 27 additions & 0 deletions resources/public/js/cluster-analysis-shared.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,30 @@ var interestingAnalysis = {
"UnreachableIntermediateMaster" : true,
"BinlogServerFailingToConnectToMaster" : true,
};

// Lookup table from a problem code to its presentation: "badge" is the CSS
// label class used to render it and "description" is the human-readable text.
// NOTE(review): keys are expected to match the problem codes found in an
// instance's Problems list — verify against the server when adding a code.
var errorMapping = {
  in_maintenance: { badge: "label-info", description: "In maintenance" },
  last_check_invalid: { badge: "label-fatal", description: "Last check invalid" },
  not_recently_checked: { badge: "label-stale", description: "Not recently checked (stale)" },
  not_replicating: { badge: "label-danger", description: "Not replicating" },
  replication_lag: { badge: "label-warning", description: "Replication lag" },
  errant_gtid: { badge: "label-errant", description: "Errant GTID" }
};
40 changes: 4 additions & 36 deletions resources/public/js/cluster-pools.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,6 @@ $(document).ready(function() {

showLoader();

var errorMapping = {
"inMaintenanceProblem": {
"badge": "label-info",
"description": "In maintenance"
},
"lastCheckInvalidProblem": {
"badge": "label-fatal",
"description": "Last check invalid"
},
"notRecentlyCheckedProblem": {
"badge": "label-stale",
"description": "Not recently checked (stale)"
},
"notReplicatingProblem": {
"badge": "label-danger",
"description": "Not replicating"
},
"replicationLagProblem": {
"badge": "label-warning",
"description": "Replication lag"
}
};

$.get(appUrl("/api/cluster-pool-instances/" + currentClusterName()), function(clusterPoolInstances) {
$.get(appUrl("/api/problems"), function(problemInstances) {
problemInstances = problemInstances || [];
Expand Down Expand Up @@ -71,6 +48,9 @@ $(document).ready(function() {
}

function incrementPoolsProblems(instance, problemType) {
if (!problemType) {
return
}
if (typeof instance.problemHint === 'undefined') {
instance.problemHint = problemType
}
Expand All @@ -83,19 +63,7 @@ $(document).ready(function() {
});
}
problemInstances.forEach(function(instance) {
if (instance.inMaintenanceProblem()) {
incrementPoolsProblems(instance, "inMaintenanceProblem")
}
//
if (instance.lastCheckInvalidProblem()) {
incrementPoolsProblems(instance, "lastCheckInvalidProblem")
} else if (instance.notRecentlyCheckedProblem()) {
incrementPoolsProblems(instance, "notRecentlyCheckedProblem")
} else if (instance.notReplicatingProblem()) {
incrementPoolsProblems(instance, "notReplicatingProblem")
} else if (instance.replicationLagProblem()) {
incrementPoolsProblems(instance, "replicationLagProblem")
}
incrementPoolsProblems(instance, instance.problem)
});

pools.forEach(function(pool) {
Expand Down
18 changes: 4 additions & 14 deletions resources/public/js/cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -1210,18 +1210,9 @@ function Cluster() {
instanceDescription += ", " + instance.SlaveLagSeconds.Int64 + "s lag";
incrementProblems("", instanceDescription)
instanceFullNames.push(getInstanceTitle(instance.Key.Hostname, instance.Key.Port));
if (instance.inMaintenanceProblem()) {
incrementProblems("inMaintenanceProblem", instanceDescription)
}
if (instance.lastCheckInvalidProblem()) {
incrementProblems("lastCheckInvalidProblem", instanceDescription)
} else if (instance.notRecentlyCheckedProblem()) {
incrementProblems("notRecentlyCheckedProblem", instanceDescription)
} else if (instance.notReplicatingProblem()) {
incrementProblems("notReplicatingProblem", instanceDescription)
} else if (instance.replicationLagProblem()) {
incrementProblems("replicationLagProblem", instanceDescription)
}
instance.Problems.forEach(function(problem) {
incrementProblems(problem, instanceDescription)
});
});
var aggergateInstance = instances[0];
aggergateInstance.isAggregate = true;
Expand Down Expand Up @@ -1396,7 +1387,7 @@ function Cluster() {
}
wrappedContent = '<div data-tag="'+tag+'">' + content + '<div style="clear: both;"></div></div>';
if (tag === "analysis") {
$(wrappedContent).insertAfter("#cluster_info [data-tag=glyphs]")
$("#cluster_info").append(wrappedContent)
} else {
$("#cluster_info").append(wrappedContent)
}
Expand Down Expand Up @@ -1532,7 +1523,6 @@ function Cluster() {
analysisContent += "<div>" + analysisEntry.AnalyzedInstanceKey.Hostname + ":" + analysisEntry.AnalyzedInstanceKey.Port + "</div>";
var content = '<div><div class="pull-left">'+glyph+'</div><div class="pull-right">'+analysisContent+'</div></div>';
addSidebarInfoPopoverContent(content, "analysis", false);

if (analysisEntry.IsStructureAnalysis) {
return;
}
Expand Down
40 changes: 4 additions & 36 deletions resources/public/js/clusters.js
Original file line number Diff line number Diff line change
@@ -1,29 +1,6 @@
$(document).ready(function() {
showLoader();

var errorMapping = {
"inMaintenanceProblem": {
"badge": "label-info",
"description": "In maintenance"
},
"lastCheckInvalidProblem": {
"badge": "label-fatal",
"description": "Last check invalid"
},
"notRecentlyCheckedProblem": {
"badge": "label-stale",
"description": "Not recently checked (stale)"
},
"notReplicatingProblem": {
"badge": "label-danger",
"description": "Not replicating"
},
"replicationLagProblem": {
"badge": "label-warning",
"description": "Replication lag"
}
};

$.get(appUrl("/api/clusters-info"), function(clusters) {
$.get(appUrl("/api/replication-analysis"), function(replicationAnalysis) {
$.get(appUrl("/api/problems"), function(problemInstances) {
Expand Down Expand Up @@ -76,26 +53,17 @@ $(document).ready(function() {
}

// Adds one to the counter kept for `problemType` under `clusterName` in
// clustersProblems, starting the counter at 1 when it has no positive value
// yet. A falsy problemType is ignored.
function incrementClusterProblems(clusterName, problemType) {
  if (!problemType) {
    return;
  }
  var counters = clustersProblems[clusterName];
  var seenBefore = counters[problemType] > 0;
  counters[problemType] = seenBefore ? counters[problemType] + 1 : 1;
}
problemInstances.forEach(function(instance) {
if (instance.inMaintenanceProblem()) {
incrementClusterProblems(instance.ClusterName, "inMaintenanceProblem")
}
//
if (instance.lastCheckInvalidProblem()) {
incrementClusterProblems(instance.ClusterName, "lastCheckInvalidProblem")
} else if (instance.notRecentlyCheckedProblem()) {
incrementClusterProblems(instance.ClusterName, "notRecentlyCheckedProblem")
} else if (instance.notReplicatingProblem()) {
incrementClusterProblems(instance.ClusterName, "notReplicatingProblem")
} else if (instance.replicationLagProblem()) {
incrementClusterProblems(instance.ClusterName, "replicationLagProblem")
}
incrementClusterProblems(instance.ClusterName, instance.problem)
});

clusters.forEach(function(cluster) {
Expand Down
51 changes: 16 additions & 35 deletions resources/public/js/orchestrator.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,6 @@ reloadPageHint = {
port: ""
}

var errorMapping = {
"inMaintenanceProblem": {
"badge": "label-info",
"description": "In maintenance"
},
"lastCheckInvalidProblem": {
"badge": "label-fatal",
"description": "Last check invalid"
},
"notRecentlyCheckedProblem": {
"badge": "label-stale",
"description": "Not recently checked (stale)"
},
"notReplicatingProblem": {
"badge": "label-danger",
"description": "Not replicating"
},
"replicationLagProblem": {
"badge": "label-warning",
"description": "Replication lag"
}
};

function updateCountdownDisplay() {
if ($.cookie("auto-refresh") == "true") {
$("#refreshCountdown").html('<span class="glyphicon glyphicon-repeat" title="Click to pause"></span> ' + secondsTillRefresh + 's');
Expand Down Expand Up @@ -659,50 +636,54 @@ function normalizeInstance(instance) {
}

function normalizeInstanceProblem(instance) {

function instanceProblemIfExists(problemName) {
if (instance.Problems.includes(problemName)) {
return problemName
}
return null;
}
instance.inMaintenanceProblem = function() {
return instance.inMaintenance;
return instanceProblemIfExists('in_maintenance');
}
instance.lastCheckInvalidProblem = function() {
return !instance.IsLastCheckValid;
return instanceProblemIfExists('last_check_invalid');
}
instance.notRecentlyCheckedProblem = function() {
return !instance.IsRecentlyChecked;
return instanceProblemIfExists('not_recently_checked');
}
instance.notReplicatingProblem = function() {
return !instance.replicationRunning && !(instance.isMaster && !instance.isCoMaster);
return instanceProblemIfExists('not_replicating');
}
instance.replicationLagProblem = function() {
return !instance.replicationLagReasonable;
return instanceProblemIfExists('replication_lag');
}
instance.errantGTIDProblem = function() {
return (instance.GtidErrant != '');
return instanceProblemIfExists('errant_gtid');
}

instance.problem = null;
if (instance.Problems.length > 0) {
instance.problem = instance.Problems[0]; // highest priority one
}
instance.problemOrder = 0;
if (instance.inMaintenanceProblem()) {
instance.problem = "in_maintenance";
instance.problemDescription = "This instance is now under maintenance due to some pending operation.\nSee audit page";
instance.problemOrder = 1;
} else if (instance.lastCheckInvalidProblem()) {
instance.problem = "last_check_invalid";
instance.problemDescription = "Instance cannot be reached by orchestrator.\nIt might be dead or there may be a network problem";
instance.problemOrder = 2;
} else if (instance.notRecentlyCheckedProblem()) {
instance.problem = "not_recently_checked";
instance.problemDescription = "Orchestrator has not made an attempt to reach this instance for a while now.\nThis should generally not happen; consider refreshing or re-discovering this instance";
instance.problemOrder = 3;
} else if (instance.notReplicatingProblem()) {
// check replicas only; where not replicating
instance.problem = "not_replicating";
instance.problemDescription = "Replication is not running.\nEither stopped manually or is failing on I/O or SQL error.";
instance.problemOrder = 4;
} else if (instance.replicationLagProblem()) {
instance.problem = "replication_lag";
instance.problemDescription = "Replica is lagging.\nThis diagnostic is based on either Seconds_behind_master or configured ReplicationLagQuery";
instance.problemOrder = 5;
} else if (instance.errantGTIDProblem()) {
instance.problem = "Errant GTID";
instance.problemDescription = "Replica has GTID entries not found on its master";
instance.problemOrder = 6;
}
Expand Down