-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
prometheus-alerts.yaml
29 lines (27 loc) · 1.07 KB
/
prometheus-alerts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Worker liveness: fires once `flower_worker_online` has evaluated to 0 for a
# given series continuously for the whole `for` window.
- alert: CeleryWorkerOffline
  expr: flower_worker_online == 0
  # Hold period before firing; the description text below quotes the same
  # 2 minutes, so keep the two in sync when tuning.
  for: "2m"
  labels:
    severity: critical
    context: celery-worker
  annotations:
    summary: 'Celery worker offline'
    # `$labels.worker` is taken from the series that matched the expression.
    description: 'Celery worker {{ $labels.worker }} has been offline for more than 2 minutes.'
# Per-task failure percentage over the trailing 15 minutes; fires when it stays
# above 1% for 5 minutes.
- alert: TaskFailureRatioTooHigh
  # `flower_events_total` carries the `_total` suffix that Prometheus naming
  # conventions reserve for cumulative counters. Averaging raw counter samples
  # (`avg_over_time`) compares lifetime totals, so old failures never age out
  # and counter resets skew the ratio. `increase()` measures only the events
  # that happened inside the window and handles resets.
  # When a task had no finished events in the window the division yields NaN,
  # which does not satisfy `> 1`, so idle tasks do not fire.
  expr: |-
    (
      sum(increase(flower_events_total{type="task-failed"}[15m])) by (task)
      /
      sum(increase(flower_events_total{type=~"task-failed|task-succeeded"}[15m])) by (task)
    ) * 100 > 1
  for: 5m
  labels:
    severity: critical
    context: celery-task
  annotations:
    summary: Task Failure Ratio Too High.
    # `$value` is the percentage computed by the expression above.
    description: Average task failure ratio for task {{ $labels.task }} is {{ $value }}.
# Prefetch latency: fires when the 15-minute average of
# `flower_task_prefetch_time_seconds`, summed per (task, worker) pair, stays
# above 1 for 5 minutes. The `_seconds` suffix suggests the threshold is in
# seconds.
- alert: TaskPrefetchTimeTooHigh
  # Folded scalar: the two lines below are joined with a single space.
  expr: >-
    sum(avg_over_time(flower_task_prefetch_time_seconds[15m]))
    by (task, worker) > 1
  for: "5m"
  labels:
    severity: critical
    context: celery-task
  annotations:
    summary: 'Average Task Prefetch Time Too High.'
    # Both labels survive the `by (task, worker)` aggregation, so they are
    # available to the template.
    description: 'Average task prefetch time at worker for task {{ $labels.task }} and worker {{ $labels.worker }} is {{ $value }}.'