openshift · abrennan89 · Oct 23, 2023 · Sep 27, 2023
diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml
@@ -2551,6 +2551,15 @@ Topics:
   File: cluster-logging-upgrading
 - Name: Viewing cluster dashboards
   File: cluster-logging-dashboards
+- Name: Logging alerts
+  Dir: logging_alerts
+  Topics:
+  - Name: Default logging alerts
+    File: default-logging-alerts
+  - Name: Custom logging alerts
+    File: custom-logging-alerts
+  - Name: Troubleshooting logging alerts
+    File: troubleshooting-logging-alerts
 - Name: Troubleshooting Logging
   Dir: troubleshooting
   Distros: openshift-enterprise,openshift-origin
@@ -2559,10 +2568,6 @@ Topics:
     File: cluster-logging-cluster-status
   - Name: Viewing the status of the log store
     File: cluster-logging-log-store-status
-  - Name: Understanding Logging alerts
-    File: cluster-logging-alerts
-  - Name: Troubleshooting for Critical Alerts
-    File: cluster-logging-troubleshooting-for-critical-alerts
 - Name: Uninstalling Logging
   File: cluster-logging-uninstall
 - Name: Exported fields

diff --git a/_topic_maps/_topic_map_osd.yml b/_topic_maps/_topic_map_osd.yml
@@ -700,17 +700,22 @@ Topics:
   File: cluster-logging-upgrading
 - Name: Viewing cluster dashboards
   File: cluster-logging-dashboards
+- Name: Logging alerts
+  Dir: logging_alerts
+  Topics:
+  - Name: Default logging alerts
+    File: default-logging-alerts
+  - Name: Custom logging alerts
+    File: custom-logging-alerts
+  - Name: Troubleshooting logging alerts
+    File: troubleshooting-logging-alerts
 - Name: Troubleshooting Logging
   Dir: troubleshooting
   Topics:
   - Name: Viewing Logging status
     File: cluster-logging-cluster-status
   - Name: Viewing the status of the log store
     File: cluster-logging-log-store-status
-  - Name: Understanding Logging alerts
-    File: cluster-logging-alerts
-  - Name: Troubleshooting for Critical Alerts
-    File: cluster-logging-troubleshooting-for-critical-alerts
 - Name: Uninstalling Logging
   File: cluster-logging-uninstall
 - Name: Exported fields

diff --git a/_topic_maps/_topic_map_rosa.yml b/_topic_maps/_topic_map_rosa.yml
@@ -869,17 +869,22 @@ Topics:
   File: cluster-logging-upgrading
 - Name: Viewing cluster dashboards
   File: cluster-logging-dashboards
+- Name: Logging alerts
+  Dir: logging_alerts
+  Topics:
+  - Name: Default logging alerts
+    File: default-logging-alerts
+  - Name: Custom logging alerts
+    File: custom-logging-alerts
+  - Name: Troubleshooting logging alerts
+    File: troubleshooting-logging-alerts
 - Name: Troubleshooting Logging
   Dir: troubleshooting
   Topics:
   - Name: Viewing Logging status
     File: cluster-logging-cluster-status
   - Name: Viewing the status of the log store
     File: cluster-logging-log-store-status
-  - Name: Understanding Logging alerts
-    File: cluster-logging-alerts
-  - Name: Troubleshooting for Critical Alerts
-    File: cluster-logging-troubleshooting-for-critical-alerts
 - Name: Uninstalling Logging
   File: cluster-logging-uninstall
 - Name: Exported fields

diff --git a/logging/logging_alerts/_attributes b/logging/logging_alerts/_attributes
@@ -0,0 +1 @@
+../../_attributes/
diff --git a/logging/logging_alerts/custom-logging-alerts.adoc b/logging/logging_alerts/custom-logging-alerts.adoc
@@ -0,0 +1,40 @@
+:_content-type: ASSEMBLY
+[id="custom-logging-alerts"]
+include::_attributes/common-attributes.adoc[]
+= Custom logging alerts
+:context: custom-logging-alerts
+
+toc::[]
+
+In logging 5.7 and later versions, you can configure the LokiStack deployment to produce customized alerts and recorded metrics. If you want to use customized link:https://grafana.com/docs/loki/latest/alert/[alerting and recording rules], you must enable the LokiStack ruler component.
+
+LokiStack log-based alerts and recorded metrics are triggered by providing link:https://grafana.com/docs/loki/latest/query/[LogQL] expressions to the ruler component. The Loki Operator manages a ruler that is optimized for the selected LokiStack size, which can be `1x.extra-small`, `1x.small`, or `1x.medium`.
+
+[NOTE]
+====
+The `1x.extra-small` size is not supported. It is for demonstration purposes only.
+====
+
+To provide these expressions, you must create an `AlertingRule` custom resource (CR) containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/[alerting rules], or a `RecordingRule` CR containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/[recording rules].
+
+Administrators can configure log-based alerts or recorded metrics for `application`, `audit`, or `infrastructure` tenants. Users without administrator permissions can configure log-based alerts or recorded metrics for `application` tenants of the applications that they have access to.
+
+Application, audit, and infrastructure alerts are sent by default to the {product-title} monitoring stack Alertmanager in the `openshift-monitoring` namespace, unless you have disabled the local Alertmanager instance. If the Alertmanager that is used to monitor user-defined projects in the `openshift-user-workload-monitoring` namespace is enabled, application alerts are sent to the Alertmanager in this namespace by default.
+
+include::modules/configuring-logging-loki-ruler.adoc[leveloffset=+1]
+include::modules/loki-rbac-permissions.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+.Additional resources
+* xref:../../authentication/using-rbac.adoc#using-rbac[Using RBAC to define and apply permissions]
+
+include::modules/logging-enabling-loki-alerts.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+[id="additional-resources_custom-logging-alerts"]
+== Additional resources
+* xref:../../monitoring/monitoring-overview.adoc#about-openshift-monitoring_monitoring-overview[About {product-title} monitoring]
+ifdef::openshift-enterprise[]
+* xref:../../post_installation_configuration/configuring-alert-notifications.adoc#configuring-alert-notifications[Configuring alert notifications]
+endif::[]
+// maybe need an update to https://docs.openshift.com/container-platform/4.13/monitoring/monitoring-overview.html#default-monitoring-targets_monitoring-overview to talk about Loki and Vector now? Are these part of default monitoring?
diff --git a/logging/logging_alerts/default-logging-alerts.adoc b/logging/logging_alerts/default-logging-alerts.adoc
@@ -0,0 +1,21 @@
+:_content-type: ASSEMBLY
+[id="default-logging-alerts"]
+include::_attributes/common-attributes.adoc[]
+= Default logging alerts
+:context: default-logging-alerts
+
+toc::[]
+
+Logging alerts are installed as part of the Cluster Logging Operator installation. Alerts depend on metrics exported by the log collection and log storage backends. These metrics are enabled if you selected the option to *Enable operator recommended cluster monitoring on this namespace* when installing the Cluster Logging Operator. For more information about installing logging Operators, see xref:../../logging/cluster-logging-deploying.adoc#cluster-logging-deploy-console_cluster-logging-deploying[Installing the {logging-title} using the web console].
+
+Default logging alerts are sent to the {product-title} monitoring stack Alertmanager in the `openshift-monitoring` namespace, unless you have disabled the local Alertmanager instance.
+
+include::modules/monitoring-accessing-the-alerting-ui.adoc[leveloffset=+1]
+include::modules/logging-vector-collector-alerts.adoc[leveloffset=+1]
+include::modules/logging-fluentd-collector-alerts.adoc[leveloffset=+1]
+include::modules/cluster-logging-elasticsearch-rules.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+[id="additional-resources_default-logging-alerts"]
+== Additional resources
+* xref:../../monitoring/managing-alerts.adoc#modifying-core-platform-alerting-rules_managing-alerts[Modifying core platform alerting rules]
diff --git a/logging/logging_alerts/images b/logging/logging_alerts/images
@@ -0,0 +1 @@
+../../images/
diff --git a/logging/logging_alerts/modules b/logging/logging_alerts/modules
@@ -0,0 +1 @@
+../../modules/
diff --git a/logging/logging_alerts/snippets b/logging/logging_alerts/snippets
@@ -0,0 +1 @@
+../../snippets/
diff --git a/logging/logging_alerts/troubleshooting-logging-alerts.adoc b/logging/logging_alerts/troubleshooting-logging-alerts.adoc
@@ -0,0 +1,55 @@
+:_content-type: ASSEMBLY
+[id="troubleshooting-logging-alerts"]
+include::_attributes/common-attributes.adoc[]
+= Troubleshooting logging alerts
+:context: troubleshooting-logging-alerts
+
+toc::[]
+
+You can use the following procedures to troubleshoot logging alerts on your cluster.
+
+include::modules/es-cluster-health-is-red.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+.Additional resources
+* xref:../../monitoring/reviewing-monitoring-dashboards.adoc#reviewing-monitoring-dashboards[Reviewing monitoring dashboards]
+* link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status]
+
+[id="elasticsearch-cluster-health-is-yellow"]
+== Elasticsearch cluster health status is yellow
+
+Replica shards for at least one primary shard are not allocated to nodes. Increase the node count by adjusting the `nodeCount` value in the `ClusterLogging` custom resource (CR).
+
+[role="_additional-resources"]
+.Additional resources
+* link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status]
+
+include::modules/es-node-disk-low-watermark-reached.adoc[leveloffset=+1]
+include::modules/es-node-disk-high-watermark-reached.adoc[leveloffset=+1]
+include::modules/es-node-disk-flood-watermark-reached.adoc[leveloffset=+1]
+
+[id="troubleshooting-logging-alerts-es-jvm-heap-use-is-high"]
+== Elasticsearch JVM heap usage is high
+
+The Elasticsearch node Java virtual machine (JVM) heap memory used is above 75%. Consider link:https://www.elastic.co/guide/en/elasticsearch/reference/current/advanced-configuration.html#set-jvm-heap-size[increasing the heap size].
+
+[id="troubleshooting-logging-alerts-aggregated-logging-system-cpu-is-high"]
+== Aggregated logging system CPU is high
+
+System CPU usage on the node is high. Check the CPU of the cluster node. Consider allocating more CPU resources to the node.
+
+[id="troubleshooting-logging-alerts-es-process-cpu-is-high"]
+== Elasticsearch process CPU is high
+
+Elasticsearch process CPU usage on the node is high. Check the CPU of the cluster node. Consider allocating more CPU resources to the node.
+
+include::modules/es-disk-space-low.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+.Additional resources
+* link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status]
+
+[id="troubleshooting-logging-alerts-es-filedescriptor-usage-is-high"]
+== Elasticsearch FileDescriptor usage is high
+
+Based on current usage trends, the predicted number of file descriptors on the node is insufficient. Check the value of `max_file_descriptors` for each node as described in the Elasticsearch link:https://www.elastic.co/guide/en/elasticsearch/reference/6.8/file-descriptors.html[File Descriptors] documentation.
diff --git a/logging/troubleshooting/cluster-logging-alerts.adoc b/logging/troubleshooting/cluster-logging-alerts.adoc