From bd76dc296e92f5723b682063383bc516139d5469 Mon Sep 17 00:00:00 2001 From: libander Date: Thu, 4 May 2023 10:11:53 -0500 Subject: [PATCH] RHDEVDOCS-4430 - w/ peer rev & qe feedback --- logging/v5_6/logging-5-6-configuration.adoc | 2 + logging/v5_7/logging-5-7-configuration.adoc | 8 +++ modules/logging-loki-alerts.adoc | 56 +++++++++++++++++++ ...ogging-alertingrule-app-callouts-snip.adoc | 31 ++++++++++ ...ogging-alertingrule-inf-callouts-snip.adoc | 35 ++++++++++++ .../logging-alertingruleCR-callouts-snip.adoc | 37 ++++++++++++ snippets/logging-alertingruleCR-snip.adoc | 33 +++++++++++ 7 files changed, 202 insertions(+) create mode 100644 modules/logging-loki-alerts.adoc create mode 100644 snippets/logging-alertingrule-app-callouts-snip.adoc create mode 100644 snippets/logging-alertingrule-inf-callouts-snip.adoc create mode 100644 snippets/logging-alertingruleCR-callouts-snip.adoc create mode 100644 snippets/logging-alertingruleCR-snip.adoc diff --git a/logging/v5_6/logging-5-6-configuration.adoc b/logging/v5_6/logging-5-6-configuration.adoc index 8bc6c5184ca0..302457bc9454 100644 --- a/logging/v5_6/logging-5-6-configuration.adoc +++ b/logging/v5_6/logging-5-6-configuration.adoc @@ -9,3 +9,5 @@ toc::[] include::snippets/logging-crs-by-operator-snip.adoc[] include::snippets/logging-supported-config-snip.adoc[] + +include::modules/logging-loki-retention.adoc[leveloffset=+1] diff --git a/logging/v5_7/logging-5-7-configuration.adoc b/logging/v5_7/logging-5-7-configuration.adoc index 02cc5bd29422..6f7b167eb9cc 100644 --- a/logging/v5_7/logging-5-7-configuration.adoc +++ b/logging/v5_7/logging-5-7-configuration.adoc @@ -9,3 +9,11 @@ toc::[] include::snippets/logging-crs-by-operator-snip.adoc[] include::snippets/logging-supported-config-snip.adoc[] + +include::modules/logging-loki-retention.adoc[leveloffset=+1] + +//include::modules/logging-loki-alerts.adoc[leveloffset=+1] + +//[role="_additional-resources"] +//.Additional resources +//* 
xref:../../monitoring/enabling-alert-routing-for-user-defined-projects.html#enabling-a-separate-alertmanager-instance-for-user-defined-alert-routing_enabling-alert-routing-for-user-defined-projects[Enabling a separate alertmanager instance] diff --git a/modules/logging-loki-alerts.adoc b/modules/logging-loki-alerts.adoc new file mode 100644 index 000000000000..770a603cbfce --- /dev/null +++ b/modules/logging-loki-alerts.adoc @@ -0,0 +1,56 @@ +// Module included in the following assemblies: +// logging-5-7-configuration + +:_content-type: PROCEDURE +[id="logging-loki-alerts_{context}"] += Enabling log-based alerts with Loki +Loki alerting rules use link:https://grafana.com/docs/loki/latest/logql/[LogQL] and follow link:https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules[Prometheus formatting]. You can set log-based alerts by creating an `AlertingRule` custom resource (CR). `AlertingRule` CRs may be created for `application`, `audit`, or `infrastructure` tenants. + +[options="header"] +|================================================ +| Tenant type | Valid namespaces +| application | +| audit | `openshift-logging` +| infrastructure | `openshift-\*`, `kube-\*`, `default` +|================================================ + +Application, Audit, and Infrastructure alerts are sent to the Cluster Monitoring Operator (CMO) Alertmanager in the `openshift-monitoring` namespace by default unless you have disabled the local `Alertmanager` instance. + +Application alerts are not sent to the CMO Alertmanager in the `openshift-user-workload-monitoring` namespace by default unless you have enabled a separate `Alertmanager` instance. + +The `AlertingRule` CR contains a set of specifications and webhook validation definitions to declare groups of alerting rules for a single LokiStack instance. 
In addition, the webhook validation definition provides support for rule validation conditions: + +* If an `AlertingRule` CR includes an invalid `interval` period, it is an invalid alerting rule. +* If an `AlertingRule` CR includes an invalid `for` period, it is an invalid alerting rule. +* If an `AlertingRule` CR includes an invalid LogQL `expr`, it is an invalid alerting rule. +* If an `AlertingRule` CR includes two groups with the same name, it is an invalid alerting rule. +* If none of the above applies, an `AlertingRule` is considered a valid alerting rule. + +.Prerequisites + +* {logging-title-uc} Operator 5.7 and later +* {product-title} 4.13 and later + +.Procedure + +1. Create an AlertingRule CR: + +-- +include::snippets/logging-create-apply-cr-snip.adoc[lines=9..12] +-- + +2. Populate your AlertingRule CR using the appropriate example below: + +-- +include::snippets/logging-alertingrule-inf-callouts-snip.adoc[] +-- + +-- +include::snippets/logging-alertingrule-app-callouts-snip.adoc[] +-- + +3. Apply the CR. 
+ +-- +include::snippets/logging-create-apply-cr-snip.adoc[lines=14..17] +-- diff --git a/snippets/logging-alertingrule-app-callouts-snip.adoc b/snippets/logging-alertingrule-app-callouts-snip.adoc new file mode 100644 index 000000000000..5c3d0113d951 --- /dev/null +++ b/snippets/logging-alertingrule-app-callouts-snip.adoc @@ -0,0 +1,31 @@ +.Example application AlertingRule CR +[source,yaml] +---- + apiVersion: loki.grafana.com/v1 + kind: AlertingRule + metadata: + name: app-user-workload + namespace: app-ns <1> + labels: <2> + openshift.io/cluster-monitoring: "true" + spec: + tenantID: "application" + groups: + - name: AppUserWorkloadHighError + rules: + - alert: + expr: | <3> + sum(rate({kubernetes_namespace_name="app-ns", kubernetes_pod_name=~"podName.*"} |= "error" [1m])) by (job) + for: 10s + labels: + severity: critical <4> + annotations: + summary: <5> + description: <6> +---- +<1> The `namespace` where this AlertingRule is created must have a label matching the LokiStack `spec.rules.namespaceSelector` definition. +<2> The `labels` block must match the LokiStack `spec.rules.selector` definition. +<3> Value for `kubernetes_namespace_name:` must match the value for `metadata.namespace`. +<4> Mandatory field. Must be `critical`, `warning`, or `info`. +<5> Mandatory field. Summary of the rule. +<6> Mandatory field. Detailed description of the rule. 
diff --git a/snippets/logging-alertingrule-inf-callouts-snip.adoc b/snippets/logging-alertingrule-inf-callouts-snip.adoc new file mode 100644 index 000000000000..59875b6cf669 --- /dev/null +++ b/snippets/logging-alertingrule-inf-callouts-snip.adoc @@ -0,0 +1,35 @@ +.Example infrastructure AlertingRule CR +[source,yaml] +---- + apiVersion: loki.grafana.com/v1 + kind: AlertingRule + metadata: + name: loki-operator-alerts + namespace: openshift-operators-redhat <1> + labels: <2> + openshift.io/cluster-monitoring: "true" + spec: + tenantID: "infrastructure" <3> + groups: + - name: LokiOperatorHighReconciliationError + rules: + - alert: HighPercentageError + expr: | <4> + sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"} |= "error" [1m])) by (job) + / + sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"}[1m])) by (job) + > 0.01 + for: 10s + labels: + severity: critical <5> + annotations: + summary: High Loki Operator Reconciliation Errors <6> + description: High Loki Operator Reconciliation Errors <7> +---- +<1> The `namespace` where this AlertingRule is created must have a label matching the LokiStack `spec.rules.namespaceSelector` definition. +<2> The `labels` block must match the LokiStack `spec.rules.selector` definition. +<3> AlertingRules for `infrastructure` tenants are only supported in the `openshift-\*`, `kube-\*`, or `default` namespaces. +<4> Value for `kubernetes_namespace_name:` must match the value for `metadata.namespace`. +<5> Mandatory field. Must be `critical`, `warning`, or `info`. +<6> Mandatory field. +<7> Mandatory field. 
diff --git a/snippets/logging-alertingruleCR-callouts-snip.adoc b/snippets/logging-alertingruleCR-callouts-snip.adoc new file mode 100644 index 000000000000..9ec8e35fb430 --- /dev/null +++ b/snippets/logging-alertingruleCR-callouts-snip.adoc @@ -0,0 +1,37 @@ +.Example AlertingRule CR +[source,yaml] +---- + apiVersion: loki.grafana.com/v1 + kind: AlertingRule + metadata: + name: loki-operator-alerts + namespace: openshift-operators-redhat <1> + labels: <2> + openshift.io/cluster-monitoring: "true" + spec: + tenantID: "infrastructure" <3> <4> <5> + groups: + - name: LokiOperatorHighReconciliationError + rules: + - alert: HighPercentageError + expr: | <6> + sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"} |= "error" [1m])) by (job) + / + sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"}[1m])) by (job) + > 0.01 + for: 10s + labels: + severity: critical <7> + annotations: + summary: High Loki Operator Reconciliation Errors <8> + description: High Loki Operator Reconciliation Errors <9> +---- +<1> The `namespace` where this AlertingRule is created must have a label matching the LokiStack `spec.rules.namespaceSelector` definition. +<2> The `labels` block must match the LokiStack `spec.rules.selector` definition. +<3> Must be `application`, `infrastructure`, or `audit`. +<4> AlertingRules for `infrastructure` tenants are only supported in the `openshift-\*`, `kube-\*`, or `default` namespaces. +<5> AlertingRules for `audit` tenants are only supported in the `openshift-logging` namespace. +<6> Value for `kubernetes_namespace_name:` must match the value for `metadata.namespace`. +<7> Mandatory field. Must be `critical`, `warning`, or `info`. +<8> Mandatory field. +<9> Mandatory field. 
diff --git a/snippets/logging-alertingruleCR-snip.adoc b/snippets/logging-alertingruleCR-snip.adoc new file mode 100644 index 000000000000..550f23eb365e --- /dev/null +++ b/snippets/logging-alertingruleCR-snip.adoc @@ -0,0 +1,33 @@ +// Text snippet included in the following assemblies: +// Text snippet included in the following modules: + +:_content-type: SNIPPET + +.Example AlertingRule CR +[source,yaml] +---- + apiVersion: loki.grafana.com/v1 + kind: AlertingRule + metadata: + name: loki-operator-alerts + namespace: openshift-operators-redhat + labels: + openshift.io/cluster-monitoring: "true" + spec: + tenantID: "infrastructure" + groups: + - name: LokiOperatorHighReconciliationError + rules: + - alert: HighPercentageError + expr: | + sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"} |= "error" [1m])) by (job) + / + sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"}[1m])) by (job) + > 0.01 + for: 10s + labels: + severity: critical + annotations: + summary: High Loki Operator Reconciliation Errors + description: High Loki Operator Reconciliation Errors +----