
Commit

Merge pull request #1317 from raptorsun/bugfix/bz1986981
Bug 1986981: Alert Config update - Patch to PR#1310
openshift-merge-robot committed Aug 21, 2021
2 parents 2ca9e0f + 1b228da commit ec61f05
Showing 5 changed files with 27 additions and 164 deletions.
2 changes: 1 addition & 1 deletion assets/control-plane/prometheus-rule.yaml
@@ -339,7 +339,7 @@ spec:
) < 0.03
and
kubelet_volume_stats_used_bytes{namespace=~"(openshift-.*|kube-.*|default)",job="kubelet", metrics_path="/metrics"} > 0
- for: 5m
+ for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFillingUp
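
The critical KubePersistentVolumeFillingUp rule now waits only 1m before firing. Assets like this one are generated from jsonnet, and local adjustments are applied with std.mergePatch (see jsonnet/utils/sanitize-rules.libsonnet at the end of this diff), so an override only replaces the fields it names and leaves the rest of the upstream rule intact. A minimal sketch of that behaviour, using a hypothetical rule and override rather than the repository's exact objects:

// Minimal illustration (hypothetical objects, not the repository's exact code):
// std.mergePatch only replaces the fields present in the override.
local upstreamRule = {
  alert: 'KubePersistentVolumeFillingUp',
  'for': '1m',
  labels: { severity: 'critical' },
};
local override = { 'for': '5m' };  // hypothetical override pinning a longer wait

std.mergePatch(upstreamRule, override)
// => same rule, but with 'for' set to '5m'; alert and labels are untouched.
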
4 changes: 2 additions & 2 deletions assets/node-exporter/prometheus-rule.yaml
@@ -41,7 +41,7 @@ spec:
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
and
- predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 2*60*60) < 0
+ predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -103,7 +103,7 @@ spec:
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
- predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 2*60*60) < 0
+ predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
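
Both hunks above change only the extrapolation horizon passed to predict_linear: the critical NodeFilesystem alerts now fire when the 6h trend of free space (or free inodes) predicts exhaustion within 4 hours instead of 2. In the jsonnet sources these expressions are rendered from a small selector/threshold config; a simplified sketch of that templating, based on the nodeExporterConfig object visible in the jsonnet file at the end of this diff:

// Simplified sketch of how the NodeFilesystemSpaceFillingUp expression is
// rendered from a config object (cf. nodeExporterConfig in sanitize-rules.libsonnet).
local cfg = {
  nodeExporterSelector: 'job="node-exporter"',
  fsSelector: 'fstype!=""',
  fsSpaceFillingUpCriticalThreshold: 15,
};

{
  expr: |||
    (
      node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d
      and
      predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
    )
  ||| % cfg,
}
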
14 changes: 7 additions & 7 deletions assets/prometheus-k8s/prometheus-rule.yaml
@@ -148,7 +148,7 @@ spec:
)
)
* 100
- > 10
+ > 1
for: 15m
labels:
severity: warning
@@ -252,25 +252,25 @@ spec:
summary: Thanos Sidecar cannot connect to Prometheus
expr: |
thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} == 0
- for: 5m
+ for: 1h
labels:
- severity: critical
+ severity: warning
- alert: ThanosSidecarBucketOperationsFailed
annotations:
description: Thanos Sidecar {{$labels.instance}} bucket operations are failing
summary: Thanos Sidecar bucket operations are failing
expr: |
sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}[5m])) > 0
- for: 5m
+ for: 1h
labels:
- severity: critical
+ severity: warning
- alert: ThanosSidecarUnhealthy
annotations:
description: Thanos Sidecar {{$labels.instance}} is unhealthy for more than
{{$value}} seconds.
summary: Thanos Sidecar is unhealthy.
expr: |
time() - max by (job, instance) (thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}) >= 240
- for: 5m
+ for: 1h
labels:
- severity: critical
+ severity: warning
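
The Thanos sidecar alerts above are relaxed from critical after 5m to warning after 1h. In this repository such adjustments are expressed as entries in the patchedRules list of jsonnet/utils/sanitize-rules.libsonnet (the last file in this diff); a hypothetical entry of that shape, shown only to illustrate the format and not the actual patch used, could look like this:

// Hypothetical patchedRules-style entry (illustrative only):
// relax ThanosSidecarUnhealthy to fire as a warning after one hour.
[
  {
    alert: 'ThanosSidecarUnhealthy',
    'for': '1h',
    labels: {
      severity: 'warning',
    },
  },
]
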
14 changes: 7 additions & 7 deletions assets/thanos-querier/prometheus-rule.yaml
@@ -23,9 +23,9 @@ spec:
/
sum by (job) (rate(http_requests_total{job="thanos-querier", handler="query"}[5m]))
) * 100 > 5
- for: 5m
+ for: 1h
labels:
- severity: critical
+ severity: warning
- alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
annotations:
description: Thanos Query {{$labels.job}} is failing to handle {{$value |
@@ -37,9 +37,9 @@ spec:
/
sum by (job) (rate(http_requests_total{job="thanos-querier", handler="query_range"}[5m]))
) * 100 > 5
- for: 5m
+ for: 1h
labels:
- severity: critical
+ severity: warning
- alert: ThanosQueryGrpcServerErrorRate
annotations:
description: Thanos Query {{$labels.job}} is failing to handle {{$value |
@@ -52,7 +52,7 @@ spec:
sum by (job) (rate(grpc_server_started_total{job="thanos-querier"}[5m]))
* 100 > 5
)
- for: 5m
+ for: 1h
labels:
severity: warning
- alert: ThanosQueryGrpcClientErrorRate
@@ -66,7 +66,7 @@ spec:
/
sum by (job) (rate(grpc_client_started_total{job="thanos-querier"}[5m]))
) * 100 > 5
- for: 5m
+ for: 1h
labels:
severity: warning
- alert: ThanosQueryHighDNSFailures
@@ -80,6 +80,6 @@ spec:
/
sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job="thanos-querier"}[5m]))
) * 100 > 1
- for: 15m
+ for: 1h
labels:
severity: warning
157 changes: 10 additions & 147 deletions jsonnet/utils/sanitize-rules.libsonnet
@@ -152,78 +152,6 @@ local patchedRules = [
severity: 'warning',
},
},
{
alert: 'KubePersistentVolumeFillingUp',
expr: |||
(
kubelet_volume_stats_available_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
/
kubelet_volume_stats_capacity_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s}
) < 0.03
and
kubelet_volume_stats_used_bytes{%(prefixedNamespaceSelector)s%(kubeletSelector)s} > 0
||| % kubernetesStorageConfig,
'for': '5m',
labels: {
severity: 'critical',
},
},
{
alert: 'KubePersistentVolumeFillingUp',
labels: {
severity: 'warning',
},
},
],
},
{
name: 'node-exporter',
local nodeExporterConfig = { nodeExporterSelector: 'job="node-exporter"', fsSelector: 'fstype!=""', fsSpaceFillingUpCriticalThreshold: 15 },
rules: [
{
alert: 'NodeFilesystemSpaceFillingUp',
expr: |||
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d
and
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 2*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % nodeExporterConfig,
'for': '1h',
labels: {
severity: 'critical',
},
},
{
alert: 'NodeFilesystemSpaceFillingUp',
labels: {
severity: 'warning',
},
},
{
alert: 'NodeFilesystemFilesFillingUp',
expr: |||
(
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
and
predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 2*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
||| % nodeExporterConfig,
'for': '1h',
labels: {
severity: 'critical',
},
},
{
alert: 'NodeFilesystemFilesFillingUp',
labels: {
severity: 'warning',
},
},
],
},
{
@@ -245,20 +173,6 @@
},
{
alert: 'PrometheusRemoteStorageFailures',
expr: |||
(
(rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m]) or rate(prometheus_remote_storage_samples_failed_total{%(prometheusSelector)s}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m]) or rate(prometheus_remote_storage_samples_failed_total{%(prometheusSelector)s}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[5m]) or rate(prometheus_remote_storage_samples_total{%(prometheusSelector)s}[5m]))
)
)
* 100
> 10
||| % { prometheusSelector: 'job=~"prometheus-k8s|prometheus-user-workload"' },
'for': '15m',
labels: {
severity: 'warning',
},
@@ -341,70 +255,19 @@ local removeRunbookUrl(rule) = rule {
local patchOrExcludeRule(rule, ruleSet, operation) =
if std.length(ruleSet) == 0 then
[rule]
else if ('alert' in rule) then
local matchedRules = std.filter(function(ruleItem) ('alert' in ruleItem) && (ruleItem.alert == rule.alert), ruleSet);
local matchedRulesSeverity = std.filter(function(ruleItem) if ('labels' in ruleItem) && ('severity' in ruleItem.labels) then ruleItem.labels.severity == rule.labels.severity else false, matchedRules);

if std.length(matchedRules) > 1 && std.length(matchedRulesSeverity) >= 1 then
local targetRule = matchedRulesSeverity[0];
if operation == 'patch' then
local patch = {
[k]: targetRule[k]
for k in std.objectFields(targetRule)
if k != 'alert' && k != 'record'
};
[std.mergePatch(rule, patch)]
else if operation == 'exclude' then
[]
else
assert false : 'operation not support ' + operation;
[]

else if std.length(matchedRules) > 1 && std.length(matchedRulesSeverity) == 0 then
assert false : 'Duplicated patch rules without matching severity for rule: ' + std.toString(rule);
[]
else if std.length(matchedRules) == 1 && std.length(matchedRulesSeverity) <= 1 then
local targetRule = matchedRules[0];
if operation == 'patch' then
local patch = {
[k]: targetRule[k]
for k in std.objectFields(targetRule)
if k != 'alert' && k != 'record'
};
[std.mergePatch(rule, patch)]
else if operation == 'exclude' then
[]
else
assert false : 'operation not support ' + operation;
[]

else if (('alert' in rule && 'alert' in ruleSet[0]) && std.startsWith(rule.alert, ruleSet[0].alert)) ||
(('record' in rule && 'record' in ruleSet[0]) && std.startsWith(rule.record, ruleSet[0].record)) then
if operation == 'patch' then
local patch = {
[k]: ruleSet[0][k]
for k in std.objectFields(ruleSet[0])
if k != 'alert' && k != 'record'
};
[std.mergePatch(rule, patch)]
else
[rule]
else if ('record' in rule) then

local matchedRules = std.filter(function(ruleItem) ('record' in ruleItem) && (ruleItem.record == rule.record), ruleSet);

if std.length(matchedRules) == 1 then
local targetRule = matchedRules[0];
if operation == 'patch' then
local patch = {
[k]: targetRule[k]
for k in std.objectFields(targetRule)
if k != 'alert' && k != 'record'
};
[std.mergePatch(rule, patch)]
else
[]
else if std.length(matchedRules) > 1 then
assert false : 'Duplicated patch for record rules: ' + std.toString(rule) + ' matching patches: ' + std.toString(matchedRules);
[]
else
[rule]

else
// neither alert nor record rule, leave it as is
[rule];

[] + patchOrExcludeRule(rule, ruleSet[1:], operation);

local patchOrExcludeRuleGroup(group, groupSet, operation) =
if std.length(groupSet) == 0 then
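
The patching logic shown above matches each patch entry to a generated rule by alert or record name — and, in the severity-aware branch, by the severity label as well — before merging the entry onto the rule with std.mergePatch. A stripped-down, self-contained sketch of that idea (simplified names, not the repository's exact code):

// Stripped-down sketch of severity-aware rule patching (illustrative only).
local applyPatch(rule, patch) =
  // Copy every field of the patch except the match keys onto the rule.
  local overrides = {
    [k]: patch[k]
    for k in std.objectFields(patch)
    if k != 'alert' && k != 'record'
  };
  std.mergePatch(rule, overrides);

local matches(rule, patch) =
  ('alert' in rule) && ('alert' in patch) && (rule.alert == patch.alert) &&
  // If the patch names a severity, only touch the rule variant with that severity,
  // so the warning and critical flavours of an alert can be patched independently.
  (if ('labels' in patch) && ('severity' in patch.labels)
   then ('labels' in rule) && ('severity' in rule.labels) && rule.labels.severity == patch.labels.severity
   else true);

// Example: two upstream variants of the same hypothetical alert exist (warning and
// critical); this patch is scoped to the critical one and lengthens its wait.
local criticalRule = { alert: 'ExampleAlert', 'for': '1m', labels: { severity: 'critical' } };
local criticalPatch = { alert: 'ExampleAlert', 'for': '5m', labels: { severity: 'critical' } };

if matches(criticalRule, criticalPatch) then applyPatch(criticalRule, criticalPatch) else criticalRule
// => the critical variant now waits 5m; a warning variant would be left untouched.
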
