Generate alerts for user workload monitoring prometheus #759

Merged · 2 commits · Apr 17, 2020
46 changes: 23 additions & 23 deletions assets/prometheus-k8s/rules.yaml
@@ -1635,7 +1635,7 @@ spec:
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
- max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="openshift-monitoring"}[5m]) == 0
+ max_over_time(prometheus_config_last_reload_successful{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) == 0
for: 10m
labels:
severity: critical
@@ -1649,9 +1649,9 @@
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
- predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="openshift-monitoring"}[5m], 60 * 30)
+ predict_linear(prometheus_notifications_queue_length{job=~"prometheus-k8s|prometheus-user-workload"}[5m], 60 * 30)
>
- min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ min_over_time(prometheus_notifications_queue_capacity{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
)
for: 15m
labels:
@@ -1664,9 +1664,9 @@
a specific Alertmanager.
expr: |
(
- rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ rate(prometheus_notifications_errors_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
/
- rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ rate(prometheus_notifications_sent_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
)
* 100
> 1
@@ -1680,9 +1680,9 @@
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without(alertmanager) (
- rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ rate(prometheus_notifications_errors_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
/
- rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ rate(prometheus_notifications_sent_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
)
* 100
> 3
@@ -1697,7 +1697,7 @@
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
- max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="openshift-monitoring"}[5m]) < 1
+ max_over_time(prometheus_notifications_alertmanagers_discovered{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) < 1
for: 10m
labels:
severity: warning
@@ -1707,7 +1707,7 @@
{{$value | humanize}} reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
expr: |
- increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="openshift-monitoring"}[3h]) > 0
+ increase(prometheus_tsdb_reloads_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0
for: 4h
labels:
severity: warning
@@ -1717,7 +1717,7 @@
{{$value | humanize}} compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
expr: |
- increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="openshift-monitoring"}[3h]) > 0
+ increase(prometheus_tsdb_compactions_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0
for: 4h
labels:
severity: warning
@@ -1727,7 +1727,7 @@
samples.
summary: Prometheus is not ingesting samples.
expr: |
- rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m]) <= 0
+ rate(prometheus_tsdb_head_samples_appended_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) <= 0
for: 10m
labels:
severity: warning
@@ -1738,7 +1738,7 @@
timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
- rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m]) > 0
+ rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0
for: 1h
labels:
severity: warning
@@ -1748,7 +1748,7 @@
{{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
- rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m]) > 0
+ rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0
for: 1h
labels:
severity: warning
@@ -1760,12 +1760,12 @@
summary: Prometheus fails to send samples to remote storage.
expr: |
(
- rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
/
(
- rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
+
- rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ rate(prometheus_remote_storage_succeeded_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
)
)
* 100
@@ -1783,9 +1783,9 @@
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
- max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
- on(job, instance) group_right
- max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
)
> 120
for: 15m
@@ -1795,17 +1795,17 @@
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
desired shards calculation wants to run {{ $value }} shards, which is more
- than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="openshift-monitoring"}`
+ than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job=~"prometheus-k8s|prometheus-user-workload"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
- max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ max_over_time(prometheus_remote_storage_shards_desired{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
>
- max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="openshift-monitoring"}[5m])
+ max_over_time(prometheus_remote_storage_shards_max{job=~"prometheus-k8s|prometheus-user-workload"}[5m])
)
for: 15m
labels:
@@ -1816,7 +1816,7 @@
evaluate {{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
expr: |
- increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m]) > 0
+ increase(prometheus_rule_evaluation_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0
for: 15m
labels:
severity: critical
@@ -1826,7 +1826,7 @@
printf "%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
- increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="openshift-monitoring"}[5m]) > 0
+ increase(prometheus_rule_group_iterations_missed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0
for: 15m
labels:
severity: warning
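Every one of the 23 expression changes above follows the same pattern: the exact `job="prometheus-k8s",namespace="openshift-monitoring"` matcher becomes a `job` regex that also matches the user workload Prometheus, and the namespace matcher is dropped because that instance runs in its own namespace. The file is generated rather than edited by hand: the upstream prometheus mixin splices one shared selector into every alert expression, so a single selector change regenerates all of these rules. A simplified sketch of that templating, assuming the upstream mixin layout (this is not code from the PR):

```jsonnet
// Standalone sketch of one alert as the upstream prometheus mixin builds it.
// In the real mixin the selector comes from $._config.prometheusSelector;
// here it is a local so the snippet evaluates on its own with `jsonnet`.
local config = {
  prometheusSelector: 'job=~"prometheus-k8s|prometheus-user-workload"',
};

{
  alert: 'PrometheusBadConfig',
  // %(prometheusSelector)s is replaced with the shared selector, producing
  // the first expression shown in the rules.yaml diff above.
  expr: |||
    max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
  ||| % config,
  'for': '10m',
  labels: { severity: 'critical' },
}
```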
2 changes: 2 additions & 0 deletions jsonnet/main.jsonnet
@@ -133,6 +133,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +

namespaceSelector: 'namespace=~"(openshift-.*|kube-.*|default|logging)"',

+ prometheusSelector: 'job=~"prometheus-k8s|prometheus-user-workload"',

kubeletPodLimit: 250,

// Certificates are issued for 4h.
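For reference, the new key uses the same `_config+::` override mechanism as the existing `namespaceSelector` line: kube-prometheus merges these overrides over its defaults, and the imported prometheus mixin reads the merged `prometheusSelector` when the assets (including the `rules.yaml` above) are regenerated. A minimal sketch of that merge, with the upstream default written as an assumption rather than quoted from kube-prometheus:

```jsonnet
// Stand-in for the kube-prometheus defaults (assumed value, for illustration).
local defaults = {
  _config+:: {
    prometheusSelector: 'job="prometheus-k8s"',
  },
};

// The override added in main.jsonnet widens the selector so the same alert
// rules also watch the user workload Prometheus.
local kp = defaults + {
  _config+:: {
    prometheusSelector: 'job=~"prometheus-k8s|prometheus-user-workload"',
  },
};

// Evaluating this file with `jsonnet` prints the merged selector the mixins see.
kp._config.prometheusSelector
```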
4 changes: 2 additions & 2 deletions pkg/manifests/bindata.go

Large diffs are not rendered by default.