Adjust NodeClock* alerting rules to work with PTP operator
This commit adapts the upstream NodeClockNotSynchronising and
NodeClockSkewDetected rules so that they are always inactive when the PTP
operator is installed. The PTP operator ships a more robust rule for
detecting unsynchronised clocks, which makes the default rules redundant in
this case.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
simonpasquier committed Dec 6, 2023
1 parent 8377489 commit dc2c689
Showing 5 changed files with 245 additions and 6 deletions.
8 changes: 6 additions & 2 deletions assets/node-exporter/prometheus-rule.yaml
@@ -173,7 +173,8 @@ spec:
annotations:
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
expr: |-
(
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
@@ -185,6 +186,7 @@ spec:
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: warning
@@ -193,10 +195,12 @@ spec:
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md
summary: Clock not synchronising.
expr: |
expr: |-
(
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: critical
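For readers unfamiliar with the guard pattern above: absent(up{job="ptp-monitor-service"}) returns a single sample only while no PTP monitor target exists, and an empty vector once it does; joining with "and on()" (matching on no labels) therefore filters the whole alert expression down to nothing whenever the PTP operator is present. A minimal jsonnet sketch of the same wrapping (ptpGuard is a hypothetical helper name; the job selector and PromQL come from the diff above):

  local ptpGuard(expr) =
    // Wrap the upstream expression and gate it on the PTP monitor job being absent.
    std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', expr);

  // Example: guarding the NodeClockNotSynchronising condition.
  ptpGuard(|||
    min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
    and
    node_timex_maxerror_seconds{job="node-exporter"} >= 16
  |||)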
50 changes: 48 additions & 2 deletions jsonnet/utils/sanitize-rules.libsonnet
@@ -1,5 +1,6 @@
local k8sMixinUtils = import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/utils.libsonnet';

// List of rule groups which are dropped from the final manifests.
local excludedRuleGroups = [
'kube-apiserver-availability.rules',
// rules managed by openshift/cluster-kube-controller-manager-operator.
@@ -16,18 +17,23 @@ local excludedRuleGroups = [
'kubernetes-system-kube-proxy',
];

// List of rules which are dropped from the final manifests.
local excludedRules = [
{
name: 'alertmanager.rules',
rules: [
// Already covered by the KubePodCrashLooping alerting rules.
{ alert: 'AlertmanagerClusterCrashlooping' },
//
{ alert: 'AlertmanagerClusterFailedToSendAlerts', severity: 'warning' },
],
},
{
name: 'general.rules',
rules: [
// CMO ships a modified TargetDown alerting rule which is less noisy than upstream.
{ alert: 'TargetDown' },
// We decided not to ship the InfoInhibitor alerting rule for now.
{ alert: 'InfoInhibitor' },
],
},
@@ -74,6 +80,7 @@ local excludedRules = [
// In addition we have alerts to detect that a Kubelet
// can't renew its certificates which makes it redundant
// to alert on certificates being almost expired.
//
// See https://coreos.slack.com/archives/CB48XQ4KZ/p1603712568136500.
{ alert: 'KubeletClientCertificateExpiration' },
{ alert: 'KubeletServerCertificateExpiration' },
@@ -82,16 +89,25 @@
{
name: 'kubernetes-apps',
rules: [
// We ship a modified KubeDeploymentReplicasMismatch alerting rule which
// takes into account the availability of the control plane nodes.
{ alert: 'KubeDeploymentReplicasMismatch' },
],
},
{
name: 'prometheus',
rules: [
// PrometheusErrorSendingAlertsToAnyAlertmanager has a critical severity but it
// can be noisy and we prefer to rely on the Watchdog alerting rule to detect
// broken communication between Prometheus and Alertmanager. We keep the
// PrometheusErrorSendingAlertsToSomeAlertmanagers alerting rule with the
// warning severity to help with root cause.
//
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
{ alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager' },
],
},
// The following rules are removed due to lack of usefulness
// The following recording rules are removed due to lack of usefulness
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1996785 for details.
{
name: 'kube-prometheus-node-recording.rules',
@@ -116,6 +132,7 @@ local excludedRules = [
{
name: 'thanos-query',
rules: [
// We have no SLO on the Thanos querier API service.
{ alert: 'ThanosQueryInstantLatencyHigh' },
{ alert: 'ThanosQueryRangeLatencyHigh' },
],
@@ -180,12 +197,26 @@ local patchedRules = [
name: 'node-exporter',
rules: [
{
// When the PTP operator is installed, it provides a more reliable and accurate
// alerting rule to detect unsynchronized clocks. The NodeClockNotSynchronising
// alerting rule is patched to never become active in this case.
// See https://issues.redhat.com/browse/MON-3544
alert: 'NodeClockNotSynchronising',
expr: function(o)
std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', o),
labels: {
severity: 'critical',
},
},
{
// See previous item.
alert: 'NodeClockSkewDetected',
expr: function(o)
std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', o),
},
{
// Extend the upstream for duration to reduce alert noise.
alert: 'NodeSystemdServiceFailed',
'for': '15m',
},
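Two jsonnet details in this hunk are worth calling out: "for" is a reserved keyword in jsonnet, so the field name must be quoted, and a patch field may now be a function of the upstream value rather than a literal replacement (the dispatch lives in patchOrExcludeRule below). An illustrative, hypothetical patch entry:

  {
    alert: 'SomeAlert',  // hypothetical name, used only for matching
    // Function-valued field: receives the upstream expr string, returns the patched one.
    expr: function(upstream)
      std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', upstream),
    // Reserved keyword as a field name, hence the quotes.
    'for': '15m',
  }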
@@ -320,6 +351,7 @@ local patchedRules = [
local kubernetesStorageConfig = { prefixedNamespaceSelector: 'namespace=~"(openshift-.*|kube-.*|default)",', kubeletSelector: 'job="kubelet", metrics_path="/metrics"' },
rules: [
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'KubePersistentVolumeErrors',
labels: {
severity: 'warning',
@@ -348,25 +380,29 @@ local patchedRules = [
'for': '1h',
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusBadConfig',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusRemoteStorageFailures',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusRuleFailures',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusRemoteWriteBehind',
labels: {
severity: 'info',
@@ -378,18 +414,21 @@
name: 'thanos-rule',
rules: [
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'ThanosNoRuleEvaluations',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'ThanosRuleHighRuleEvaluationFailures',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'ThanosRuleSenderIsFailingAlerts',
labels: {
severity: 'warning',
@@ -472,19 +511,26 @@ local patchOrExcludeRule(rule, ruleSet, operation) =
if std.length(ruleSet) == 0 then
[rule]
else if ('severity' in ruleSet[0] && !std.startsWith(rule.labels.severity, ruleSet[0].severity)) then
// If the 'severity' field is set then it means "drop any alerting rule
// matching this name + severity label".
[] + patchOrExcludeRule(rule, ruleSet[1:], operation)
else if (('alert' in rule && 'alert' in ruleSet[0]) && std.startsWith(rule.alert, ruleSet[0].alert)) ||
(('record' in rule && 'record' in ruleSet[0]) && std.startsWith(rule.record, ruleSet[0].record)) then
if operation == 'patch' then
local patch = {
[k]: ruleSet[0][k]
[k]: if (std.isFunction(ruleSet[0][k])) then
ruleSet[0][k](rule[k])
else
ruleSet[0][k]
for k in std.objectFields(ruleSet[0])
if k != 'alert' && k != 'record'
};
[std.mergePatch(rule, patch)]
else
// action is 'exclude'.
[]
else
// Evaluate the next override.
[] + patchOrExcludeRule(rule, ruleSet[1:], operation);

local patchOrExcludeRuleGroup(group, groupSet, operation) =
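To make the new function dispatch concrete, here is a self-contained sketch that mirrors the patch logic above (the rule and override values are illustrative and simplified, not the shipped ones):

  local rule = {
    alert: 'NodeClockSkewDetected',
    expr: 'node_timex_offset_seconds{job="node-exporter"} > 0.05',
    labels: { severity: 'warning' },
  };
  local override = {
    alert: 'NodeClockSkewDetected',
    expr: function(o) std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', o),
  };
  // Function-valued fields are evaluated against the upstream value; everything
  // else is copied verbatim. 'alert'/'record' are matching keys, not patch data.
  local patch = {
    [k]: if std.isFunction(override[k]) then override[k](rule[k]) else override[k]
    for k in std.objectFields(override)
    if k != 'alert' && k != 'record'
  };
  std.mergePatch(rule, patch)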
80 changes: 80 additions & 0 deletions test/rules/NodeClockNotSynchronising.yaml
@@ -0,0 +1,80 @@
rule_files:
- rules.yaml

evaluation_interval: 1m

tests:
# When the PTP operator isn't installed, NodeClockNotSynchronising should
# become active when the conditions are met.
- interval: 1m
input_series:
# node with a zero sync status and maxerrors above threshold.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 0x20 0
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 16x20 0
# node with a zero sync status but acceptable maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 0x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 15x20
# node with a non-zero sync status and increasing maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 1x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 0+1x20
alert_rule_test:
- eval_time: 9m
alertname: NodeClockNotSynchronising
exp_alerts:
- eval_time: 10m
alertname: NodeClockNotSynchronising
exp_alerts:
- exp_labels:
severity: critical
namespace: openshift-monitoring
container: kube-rbac-proxy
endpoint: https
instance: ocp-master-0
job: node-exporter
pod: node-exporter-master-0
service: node-exporter
exp_annotations:
description: "Clock at ocp-master-0 is not synchronising. Ensure NTP is configured on this host."
runbook_url: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md"
summary: "Clock not synchronising."
- eval_time: 21m
alertname: NodeClockNotSynchronising
exp_alerts:

# When the PTP operator is installed, NodeClockNotSynchronising should
# never become active, even when the conditions are met.
- interval: 1m
input_series:
- series: 'up{job="ptp-monitor-service"}'
values: 1x40
# node with a zero sync status and maxerrors above threshold.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 0x20 0
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 16x20 0
# node with a zero sync status but acceptable maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 0x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 15x20
# node with a non-zero sync status and increasing maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 1x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 0+1x20
alert_rule_test:
- eval_time: 9m
alertname: NodeClockNotSynchronising
exp_alerts:
- eval_time: 10m
alertname: NodeClockNotSynchronising
exp_alerts:
- eval_time: 21m
alertname: NodeClockNotSynchronising
exp_alerts:
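The tests above use promtool's rule unit-test format: in input_series, a value of "0x20" expands to 21 samples of 0 (the initial value plus 20 repetitions), "16x20" to 21 samples of 16, and "0+1x20" to the increasing sequence 0, 1, 2, ..., 20. An exp_alerts entry left empty asserts that the alert is not firing at that eval_time. The second test feeds the same failing clock series plus an up{job="ptp-monitor-service"} series and expects no alerts at any point, confirming that the guard keeps the rule inactive while the PTP operator is installed.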
