diff --git a/charts/generic-service/README.md b/charts/generic-service/README.md index 35bf7d3..1dff9c8 100644 --- a/charts/generic-service/README.md +++ b/charts/generic-service/README.md @@ -154,14 +154,15 @@ app: | `alerting.cpu.maxThrottleFactor` | `0.01` | The maximum fraction of the container's execution time during which it experiences CPU throttling | | `alerting.cpu.quotaBufferFactor` | `1.0` | Multiplied with `resources.*.cpu` to determine minimum allowed unused CPU quota in namespace | | `alerting.http.sampleInterval` | `20m` | The time interval in which to measure HTTP responses for triggering alerts | -| `alerting.http.referenceInterval` | `1w` | The time interval to to compare with the sample interval to detect changes | +| `alerting.http.referenceInterval` | `1w` | The time interval to compare with the sample interval to detect changes | | `alerting.http.maxSlowdown` | `2.5` | The maximum HTTP response slowdown in the sample interval compared to the reference interval | | `alerting.http.max4xxRatio` | `2.5` | The maximum HTTP 4xx ratio increase in the sample interval compared to the reference interval | | `alerting.http.max5xxCount` | `0` | The maximum number of HTTP 5xx responses (except 504) in the sample interval | | `alerting.http.maxTimeoutCount` | `0` | The maximum number of HTTP gateway timeout responses (504) in the sample interval | | `alerting.grpc.requestsMetric` | `grpc_server_handled_total` | The name of the Prometheus metric counting gRPC requests | +| `alerting.grpc.ignoreErrorCodes` | `[]` | Which non-successful gRPC status codes will be ignored for triggering alerts | | `alerting.grpc.sampleInterval` | `20m` | The time interval in which to measure gRPC responses | -| `alerting.grpc.referenceInterval` | `1w` | The time interval to to compare with the sample interval to detect changes | +| `alerting.grpc.referenceInterval` | `1w` | The time interval to compare with the sample interval to detect changes | | `alerting.grpc.maxErrorRatio` 
| `2.5` | The maximum gRPC error ratio increase in the sample interval compared to the reference interval | | `alerting.grpc.errorDuration` | | The duration for which the gRPC error rate has to remain elevated before triggering an alert | | `alerting.grpc.maxCriticalErrors` | `0` | The maximum number of critical gRPC errors responses in the sample interval | diff --git a/charts/generic-service/templates/alerts.yaml b/charts/generic-service/templates/alerts.yaml index d957986..ddba0dd 100644 --- a/charts/generic-service/templates/alerts.yaml +++ b/charts/generic-service/templates/alerts.yaml @@ -197,10 +197,11 @@ spec: {{- if or (eq .Values.ingress.protocol "grpc") (eq .Values.ingress.protocol "grpcs") }} {{- if .Values.alerting.grpc.referenceInterval }} + {{- $ignoreCodes := prepend (default (list) .Values.alerting.grpc.ignoreErrorCodes) "OK" }} - alert: GrpcErrors expr: | - (sum(rate({{ .Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}",grpc_code!="OK"}[{{ .Values.alerting.grpc.sampleInterval }}])) / sum(rate({{ .Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}"}[{{ .Values.alerting.grpc.sampleInterval }}]))) / - (sum(rate({{ .Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}",grpc_code!="OK"}[{{ .Values.alerting.grpc.referenceInterval }}])) / sum(rate({{ .Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}"}[{{ .Values.alerting.grpc.referenceInterval }}]))) + (sum(rate({{ .Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}",grpc_code!~"{{ $ignoreCodes | join "|" }}"}[{{ .Values.alerting.grpc.sampleInterval }}])) / sum(rate({{ .Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}"}[{{ .Values.alerting.grpc.sampleInterval }}]))) / + (sum(rate({{ 
.Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}",grpc_code!~"{{ $ignoreCodes | join "|" }}"}[{{ .Values.alerting.grpc.referenceInterval }}])) / sum(rate({{ .Values.alerting.grpc.requestsMetric }}{namespace="{{ .Release.Namespace }}",release="{{ .Release.Name }}"}[{{ .Values.alerting.grpc.referenceInterval }}]))) > {{ .Values.alerting.grpc.maxErrorRatio }} {{- if .Values.alerting.grpc.errorDuration }} for: {{ .Values.alerting.grpc.errorDuration }} diff --git a/charts/generic-service/values.schema.json b/charts/generic-service/values.schema.json index c79e4c9..ed434f7 100644 --- a/charts/generic-service/values.schema.json +++ b/charts/generic-service/values.schema.json @@ -907,6 +907,12 @@ "default": "grpc_server_handled_total", "description": "The name of the Prometheus metric counting gRPC requests" }, + "ignoreErrorCodes": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "description": "Which non-successful gRPC status codes will be ignored for triggering alerts" + }, "sampleInterval": { "type": "string", "default": "15m", diff --git a/charts/generic-service/values.yaml b/charts/generic-service/values.yaml index 8a19b88..32e5c4f 100644 --- a/charts/generic-service/values.yaml +++ b/charts/generic-service/values.yaml @@ -176,6 +176,7 @@ alerting: maxTimeoutCount: 0 grpc: requestsMetric: grpc_server_handled_total + ignoreErrorCodes: [] sampleInterval: 20m referenceInterval: 1w maxErrorRatio: 2.5