Adjust NodeClock* alerting rules to work with PTP operator
This commit adapts the upstream NodeClockNotSynchronising and
NodeClockSkewDetected rules so that they are always inactive when the PTP
operator is installed. The PTP operator ships a more robust rule for
detecting unsynchronised clocks, which makes the default rules redundant in
this case.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
simonpasquier committed Dec 6, 2023
1 parent 8377489 commit dc2c689
Showing 5 changed files with 245 additions and 6 deletions.
8 changes: 6 additions & 2 deletions assets/node-exporter/prometheus-rule.yaml
@@ -173,7 +173,8 @@ spec:
annotations:
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
summary: Clock skew detected.
expr: |
expr: |-
(
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
@@ -185,6 +186,7 @@ spec:
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: warning
@@ -193,10 +195,12 @@ spec:
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md
summary: Clock not synchronising.
expr: |
expr: |-
(
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: critical
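For readers unfamiliar with the guard pattern above: absent(up{job="ptp-monitor-service"}) returns a single sample only while no PTP monitor target exists, and an empty vector once it does; joining with "and on()" (matching on no labels) therefore filters the whole alert expression down to nothing whenever the PTP operator is present. A minimal jsonnet sketch of the same wrapping (ptpGuard is a hypothetical helper name; the job selector and PromQL come from the diff above):

  local ptpGuard(expr) =
    // Wrap the upstream expression and gate it on the PTP monitor job being absent.
    std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', expr);

  // Example: guarding the NodeClockNotSynchronising condition.
  ptpGuard(|||
    min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
    and
    node_timex_maxerror_seconds{job="node-exporter"} >= 16
  |||)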
50 changes: 48 additions & 2 deletions jsonnet/utils/sanitize-rules.libsonnet
@@ -1,5 +1,6 @@
local k8sMixinUtils = import 'github.com/kubernetes-monitoring/kubernetes-mixin/lib/utils.libsonnet';

// List of rule groups which are dropped from the final manifests.
local excludedRuleGroups = [
'kube-apiserver-availability.rules',
// rules managed by openshift/cluster-kube-controller-manager-operator.
@@ -16,18 +17,23 @@ local excludedRuleGroups = [
'kubernetes-system-kube-proxy',
];

// List of rules which are dropped from the final manifests.
local excludedRules = [
{
name: 'alertmanager.rules',
rules: [
// Already covered by the KubePodCrashLooping alerting rules.
{ alert: 'AlertmanagerClusterCrashlooping' },
//
{ alert: 'AlertmanagerClusterFailedToSendAlerts', severity: 'warning' },
],
},
{
name: 'general.rules',
rules: [
// CMO ships a modified TargetDown alerting rule which is less noisy than upstream.
{ alert: 'TargetDown' },
// We decided not to ship the InfoInhibitor alerting rule for now.
{ alert: 'InfoInhibitor' },
],
},
@@ -74,6 +80,7 @@ local excludedRules = [
// In addition we have alerts to detect that a Kubelet
// can't renew its certificates which makes it redundant
// to alert on certificates being almost expired.
//
// See https://coreos.slack.com/archives/CB48XQ4KZ/p1603712568136500.
{ alert: 'KubeletClientCertificateExpiration' },
{ alert: 'KubeletServerCertificateExpiration' },
@@ -82,16 +89,25 @@
{
name: 'kubernetes-apps',
rules: [
// We ship a modified KubeDeploymentReplicasMismatch alerting rule which
// takes into account the availability of the control plane nodes.
{ alert: 'KubeDeploymentReplicasMismatch' },
],
},
{
name: 'prometheus',
rules: [
// PrometheusErrorSendingAlertsToAnyAlertmanager has a critical severity but it
// can be noisy and we prefer to rely on the Watchdog alerting rule to detect
// broken communication between Prometheus and Alertmanager. We keep the
// PrometheusErrorSendingAlertsToSomeAlertmanagers alerting rule with the
// warning severity to help with root cause.
//
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
{ alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager' },
],
},
// The following rules are removed due to lack of usefulness
// The following recording rules are removed due to lack of usefulness
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1996785 for details.
{
name: 'kube-prometheus-node-recording.rules',
@@ -116,6 +132,7 @@ local excludedRules = [
{
name: 'thanos-query',
rules: [
// We have no SLO on the Thanos querier API service.
{ alert: 'ThanosQueryInstantLatencyHigh' },
{ alert: 'ThanosQueryRangeLatencyHigh' },
],
@@ -180,12 +197,26 @@ local patchedRules = [
name: 'node-exporter',
rules: [
{
// When the PTP operator is installed, it provides a more reliable and accurate
// alerting rule to detect unsynchronized clocks. The NodeClockNotSynchronising
// alerting rule is patched to never become active in this case.
// See https://issues.redhat.com/browse/MON-3544
alert: 'NodeClockNotSynchronising',
expr: function(o)
std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', o),
labels: {
severity: 'critical',
},
},
{
// See previous item.
alert: 'NodeClockSkewDetected',
expr: function(o)
std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', o),
},
{
// Extend the upstream for duration to reduce alert noise.
alert: 'NodeSystemdServiceFailed',
'for': '15m',
},
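Two jsonnet details in this hunk are worth calling out: "for" is a reserved keyword in jsonnet, so the field name must be quoted, and a patch field may now be a function of the upstream value rather than a literal replacement (the dispatch lives in patchOrExcludeRule below). An illustrative, hypothetical patch entry:

  {
    alert: 'SomeAlert',  // hypothetical name, used only for matching
    // Function-valued field: receives the upstream expr string, returns the patched one.
    expr: function(upstream)
      std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', upstream),
    // Reserved keyword as a field name, hence the quotes.
    'for': '15m',
  }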
@@ -320,6 +351,7 @@ local patchedRules = [
local kubernetesStorageConfig = { prefixedNamespaceSelector: 'namespace=~"(openshift-.*|kube-.*|default)",', kubeletSelector: 'job="kubelet", metrics_path="/metrics"' },
rules: [
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'KubePersistentVolumeErrors',
labels: {
severity: 'warning',
@@ -348,25 +380,29 @@ local patchedRules = [
'for': '1h',
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusBadConfig',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusRemoteStorageFailures',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusRuleFailures',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'PrometheusRemoteWriteBehind',
labels: {
severity: 'info',
@@ -378,18 +414,21 @@
name: 'thanos-rule',
rules: [
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'ThanosNoRuleEvaluations',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'ThanosRuleHighRuleEvaluationFailures',
labels: {
severity: 'warning',
},
},
{
// Refer to https://bugzilla.redhat.com/show_bug.cgi?id=1986981 for details.
alert: 'ThanosRuleSenderIsFailingAlerts',
labels: {
severity: 'warning',
@@ -472,19 +511,26 @@ local patchOrExcludeRule(rule, ruleSet, operation) =
if std.length(ruleSet) == 0 then
[rule]
else if ('severity' in ruleSet[0] && !std.startsWith(rule.labels.severity, ruleSet[0].severity)) then
// If the 'severity' field is set then it means "drop any alerting rule
// matching this name + severity label".
[] + patchOrExcludeRule(rule, ruleSet[1:], operation)
else if (('alert' in rule && 'alert' in ruleSet[0]) && std.startsWith(rule.alert, ruleSet[0].alert)) ||
(('record' in rule && 'record' in ruleSet[0]) && std.startsWith(rule.record, ruleSet[0].record)) then
if operation == 'patch' then
local patch = {
[k]: ruleSet[0][k]
[k]: if (std.isFunction(ruleSet[0][k])) then
ruleSet[0][k](rule[k])
else
ruleSet[0][k]
for k in std.objectFields(ruleSet[0])
if k != 'alert' && k != 'record'
};
[std.mergePatch(rule, patch)]
else
// action is 'exclude'.
[]
else
// Evaluate the next override.
[] + patchOrExcludeRule(rule, ruleSet[1:], operation);

local patchOrExcludeRuleGroup(group, groupSet, operation) =
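To make the new function dispatch concrete, here is a self-contained sketch that mirrors the patch logic above (the rule and override values are illustrative and simplified, not the shipped ones):

  local rule = {
    alert: 'NodeClockSkewDetected',
    expr: 'node_timex_offset_seconds{job="node-exporter"} > 0.05',
    labels: { severity: 'warning' },
  };
  local override = {
    alert: 'NodeClockSkewDetected',
    expr: function(o) std.format('(\n%s) and on() absent(up{job="ptp-monitor-service"})', o),
  };
  // Function-valued fields are evaluated against the upstream value; everything
  // else is copied verbatim. 'alert'/'record' are matching keys, not patch data.
  local patch = {
    [k]: if std.isFunction(override[k]) then override[k](rule[k]) else override[k]
    for k in std.objectFields(override)
    if k != 'alert' && k != 'record'
  };
  std.mergePatch(rule, patch)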
80 changes: 80 additions & 0 deletions test/rules/NodeClockNotSynchronising.yaml
@@ -0,0 +1,80 @@
rule_files:
- rules.yaml

evaluation_interval: 1m

tests:
# When the PTP operator isn't installed, NodeClockNotSynchronising should
# become active when the conditions are met.
- interval: 1m
input_series:
# node with a zero sync status and maxerrors above threshold.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 0x20 0
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 16x20 0
# node with a zero sync status but acceptable maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 0x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 15x20
# node with a non-zero sync status and increasing maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 1x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 0+1x20
alert_rule_test:
- eval_time: 9m
alertname: NodeClockNotSynchronising
exp_alerts:
- eval_time: 10m
alertname: NodeClockNotSynchronising
exp_alerts:
- exp_labels:
severity: critical
namespace: openshift-monitoring
container: kube-rbac-proxy
endpoint: https
instance: ocp-master-0
job: node-exporter
pod: node-exporter-master-0
service: node-exporter
exp_annotations:
description: "Clock at ocp-master-0 is not synchronising. Ensure NTP is configured on this host."
runbook_url: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md"
summary: "Clock not synchronising."
- eval_time: 21m
alertname: NodeClockNotSynchronising
exp_alerts:

# When the PTP operator is installed, NodeClockNotSynchronising should
# never become active, even when the conditions are met.
- interval: 1m
input_series:
- series: 'up{job="ptp-monitor-service"}'
values: 1x40
# node with a zero sync status and maxerrors above threshold.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 0x20 0
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-0",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-0",service="node-exporter"}'
values: 16x20 0
# node with a zero sync status but acceptable maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 0x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-1",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-1",service="node-exporter"}'
values: 15x20
# node with a non-zero sync status and increasing maxerrors.
- series: 'node_timex_sync_status{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 1x20
- series: 'node_timex_maxerror_seconds{container="kube-rbac-proxy",endpoint="https",instance="ocp-master-2",job="node-exporter",namespace="openshift-monitoring",pod="node-exporter-master-2",service="node-exporter"}'
values: 0+1x20
alert_rule_test:
- eval_time: 9m
alertname: NodeClockNotSynchronising
exp_alerts:
- eval_time: 10m
alertname: NodeClockNotSynchronising
exp_alerts:
- eval_time: 21m
alertname: NodeClockNotSynchronising
exp_alerts:
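The tests above use promtool's rule unit-test format: in input_series, a value of "0x20" expands to 21 samples of 0 (the initial value plus 20 repetitions), "16x20" to 21 samples of 16, and "0+1x20" to the increasing sequence 0, 1, 2, ..., 20. An exp_alerts entry left empty asserts that the alert is not firing at that eval_time. The second test feeds the same failing clock series plus an up{job="ptp-monitor-service"} series and expects no alerts at any point, confirming that the guard keeps the rule inactive while the PTP operator is installed.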
