From d7aa745662805bb4921235317b49f8cd982dcaa0 Mon Sep 17 00:00:00 2001 From: Bastian Eicher Date: Wed, 17 Jan 2024 12:00:41 +0100 Subject: [PATCH] Added alerting.restarts.enabled --- charts/generic-service/README.md | 1 + charts/generic-service/templates/alerts.yaml | 2 ++ charts/generic-service/values.schema.json | 10 ++++++++++ charts/generic-service/values.yaml | 2 ++ 4 files changed, 15 insertions(+) diff --git a/charts/generic-service/README.md b/charts/generic-service/README.md index d867e3c..1f9399a 100644 --- a/charts/generic-service/README.md +++ b/charts/generic-service/README.md @@ -141,6 +141,7 @@ app: | `alerting.enabled` | `false` | Deploys Prometheus alert rule for issues like like unavailable pods or high memory use | | `alerting.pod.maxStartupSeconds` | `120` | The maximum amount of time a Pod is allowed to take for startup | | `alerting.pod.maxAgeSeconds` | | The maximum allowed age of a `Pod` in seconds (useful to ensure regular deployments) | +| `alerting.restarts.enabled` | `true` | Deploys Prometheus alert rule for unexpected container restarts | | `alerting.memory.enabled` | `true` | Enables alerts relating to memory usage | | `alerting.memory.maxUsageFactor` | `0.9` | The maximum usage factor of the memory limit (between `0` and `1`) | | `alerting.memory.quotaBufferFactor` | `1.0` | Multiplied with `resources.*.memory` to determine minimum allowed unused memory quota in namespace | diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index 7d41864..5785f8f 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ -45,6 +45,7 @@ spec: {{- end }} {{- end }} + {{- if .Values.alerting.restarts.enabled }} - alert: ContainerRestart # Avoid constantly retriggering during crash loops by comparing over interval slightly longer than CrashLoopBackOff upper limit (5m). # Don't trigger during startup grace period (service might just be waiting for dependencies). @@ -56,6 +57,7 @@ spec: topic: availability annotations: {{- include "generic-service.alert-annotations" . | nindent 12 }} crash/restart description: '{{"{{ $labels.pod }}"}} has crashed/restarted.' + {{- end }} {{- if .Values.alerting.pod.maxAgeSeconds }} - alert: PodTooOld diff --git a/charts/generic-service/values.schema.json b/charts/generic-service/values.schema.json index d73a59d..d7b7f9b 100644 --- a/charts/generic-service/values.schema.json +++ b/charts/generic-service/values.schema.json @@ -784,6 +784,16 @@ }, "additionalProperties": false }, + "restarts": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "default": true, + "description": "Deploys Prometheus alert rule for unexpected container restarts" + } + } + }, "memory": { "type": "object", "properties": { diff --git a/charts/generic-service/values.yaml b/charts/generic-service/values.yaml index b27cd5f..7fd2578 100644 --- a/charts/generic-service/values.yaml +++ b/charts/generic-service/values.yaml @@ -154,6 +154,8 @@ alerting: pod: maxStartupSeconds: 120 maxAgeSeconds: + restarts: + enabled: true memory: enabled: true maxUsageFactor: 0.9