From c85443723ba792424a89a160f499fb4cce7a10eb Mon Sep 17 00:00:00 2001 From: abrennan89 Date: Wed, 27 Sep 2023 13:43:58 -0500 Subject: [PATCH] OBSDOCS-248: Updating logging alerts docs --- _topic_maps/_topic_map.yml | 13 +- _topic_maps/_topic_map_osd.yml | 13 +- _topic_maps/_topic_map_rosa.yml | 13 +- logging/logging_alerts/_attributes | 1 + .../logging_alerts/custom-logging-alerts.adoc | 42 ++ .../default-logging-alerts.adoc | 21 + logging/logging_alerts/images | 1 + logging/logging_alerts/modules | 1 + logging/logging_alerts/snippets | 1 + .../troubleshooting-logging-alerts.adoc | 55 ++ .../cluster-logging-alerts.adoc | 37 -- ...g-troubleshooting-for-critical-alerts.adoc | 554 ------------------ logging/v5_7/logging-5-7-configuration.adoc | 3 - ...ster-logging-collector-alerts-viewing.adoc | 28 - .../cluster-logging-elasticsearch-rules.adoc | 11 +- modules/configuring-logging-loki-ruler.adoc | 37 ++ modules/es-cluster-health-is-red.adoc | 172 ++++++ modules/es-disk-space-low.adoc | 72 +++ .../es-node-disk-flood-watermark-reached.adoc | 89 +++ .../es-node-disk-high-watermark-reached.adoc | 78 +++ .../es-node-disk-low-watermark-reached.adoc | 90 +++ ...adoc => logging-enabling-loki-alerts.adoc} | 34 +- ... => logging-fluentd-collector-alerts.adoc} | 29 +- modules/logging-vector-collector-alerts.adoc | 36 ++ modules/loki-rbac-permissions.adoc | 28 + .../monitoring-accessing-the-alerting-ui.adoc | 3 +- snippets/es-pod-var-logging.adoc | 31 + support/index.adoc | 2 +- 28 files changed, 818 insertions(+), 677 deletions(-) create mode 120000 logging/logging_alerts/_attributes create mode 100644 logging/logging_alerts/custom-logging-alerts.adoc create mode 100644 logging/logging_alerts/default-logging-alerts.adoc create mode 120000 logging/logging_alerts/images create mode 120000 logging/logging_alerts/modules create mode 120000 logging/logging_alerts/snippets create mode 100644 logging/logging_alerts/troubleshooting-logging-alerts.adoc delete mode 100644 logging/troubleshooting/cluster-logging-alerts.adoc delete mode 100644 logging/troubleshooting/cluster-logging-troubleshooting-for-critical-alerts.adoc delete mode 100644 modules/cluster-logging-collector-alerts-viewing.adoc create mode 100644 modules/configuring-logging-loki-ruler.adoc create mode 100644 modules/es-cluster-health-is-red.adoc create mode 100644 modules/es-disk-space-low.adoc create mode 100644 modules/es-node-disk-flood-watermark-reached.adoc create mode 100644 modules/es-node-disk-high-watermark-reached.adoc create mode 100644 modules/es-node-disk-low-watermark-reached.adoc rename modules/{logging-loki-alerts.adoc => logging-enabling-loki-alerts.adoc} (75%) rename modules/{cluster-logging-collector-alerts.adoc => logging-fluentd-collector-alerts.adoc} (57%) create mode 100644 modules/logging-vector-collector-alerts.adoc create mode 100644 modules/loki-rbac-permissions.adoc create mode 100644 snippets/es-pod-var-logging.adoc diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index 849bee8febee..3fd23050b4d0 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -2546,6 +2546,15 @@ Topics: File: cluster-logging-upgrading - Name: Viewing cluster dashboards File: cluster-logging-dashboards +- Name: Logging alerts + Dir: logging_alerts + Topics: + - Name: Default logging alerts + File: default-logging-alerts + - Name: Custom logging alerts + File: custom-logging-alerts + - Name: Troubleshooting logging alerts + File: troubleshooting-logging-alerts - Name: Troubleshooting Logging Dir: 
troubleshooting Distros: openshift-enterprise,openshift-origin @@ -2554,10 +2563,6 @@ Topics: File: cluster-logging-cluster-status - Name: Viewing the status of the log store File: cluster-logging-log-store-status - - Name: Understanding Logging alerts - File: cluster-logging-alerts - - Name: Troubleshooting for Critical Alerts - File: cluster-logging-troubleshooting-for-critical-alerts - Name: Uninstalling Logging File: cluster-logging-uninstall - Name: Exported fields diff --git a/_topic_maps/_topic_map_osd.yml b/_topic_maps/_topic_map_osd.yml index dffe0561ea6f..fb71bb995247 100644 --- a/_topic_maps/_topic_map_osd.yml +++ b/_topic_maps/_topic_map_osd.yml @@ -702,6 +702,15 @@ Topics: File: cluster-logging-upgrading - Name: Viewing cluster dashboards File: cluster-logging-dashboards +- Name: Logging alerts + Dir: logging_alerts + Topics: + - Name: Default logging alerts + File: default-logging-alerts + - Name: Custom logging alerts + File: custom-logging-alerts + - Name: Troubleshooting logging alerts + File: troubleshooting-logging-alerts - Name: Troubleshooting Logging Dir: troubleshooting Topics: @@ -709,10 +718,6 @@ Topics: File: cluster-logging-cluster-status - Name: Viewing the status of the log store File: cluster-logging-log-store-status - - Name: Understanding Logging alerts - File: cluster-logging-alerts - - Name: Troubleshooting for Critical Alerts - File: cluster-logging-troubleshooting-for-critical-alerts - Name: Uninstalling Logging File: cluster-logging-uninstall - Name: Exported fields diff --git a/_topic_maps/_topic_map_rosa.yml b/_topic_maps/_topic_map_rosa.yml index a80925e56fe4..76a3ed8eb082 100644 --- a/_topic_maps/_topic_map_rosa.yml +++ b/_topic_maps/_topic_map_rosa.yml @@ -871,6 +871,15 @@ Topics: File: cluster-logging-upgrading - Name: Viewing cluster dashboards File: cluster-logging-dashboards +- Name: Logging alerts + Dir: logging_alerts + Topics: + - Name: Default logging alerts + File: default-logging-alerts + - Name: Custom logging alerts + File: custom-logging-alerts + - Name: Troubleshooting logging alerts + File: troubleshooting-logging-alerts - Name: Troubleshooting Logging Dir: troubleshooting Topics: @@ -878,10 +887,6 @@ Topics: File: cluster-logging-cluster-status - Name: Viewing the status of the log store File: cluster-logging-log-store-status - - Name: Understanding Logging alerts - File: cluster-logging-alerts - - Name: Troubleshooting for Critical Alerts - File: cluster-logging-troubleshooting-for-critical-alerts - Name: Uninstalling Logging File: cluster-logging-uninstall - Name: Exported fields diff --git a/logging/logging_alerts/_attributes b/logging/logging_alerts/_attributes new file mode 120000 index 000000000000..20cc1dcb77bf --- /dev/null +++ b/logging/logging_alerts/_attributes @@ -0,0 +1 @@ +../../_attributes/ \ No newline at end of file diff --git a/logging/logging_alerts/custom-logging-alerts.adoc b/logging/logging_alerts/custom-logging-alerts.adoc new file mode 100644 index 000000000000..d0d9280cf596 --- /dev/null +++ b/logging/logging_alerts/custom-logging-alerts.adoc @@ -0,0 +1,42 @@ +:_content-type: ASSEMBLY +[id="custom-logging-alerts"] +include::_attributes/common-attributes.adoc[] += Custom logging alerts +:context: custom-logging-alerts + +toc::[] + +In logging 5.7 and later versions, users can configure the LokiStack deployment to produce customized alerts and recorded metrics. 
If you want to use customized link:https://grafana.com/docs/loki/latest/alert/[alerting and recording rules], you must enable the LokiStack ruler component. + +LokiStack log-based alerts and recorded metrics are triggered by providing link:https://grafana.com/docs/loki/latest/query/[LogQL] expressions to the ruler component. The Loki Operator manages a ruler that is optimized for the selected LokiStack size, which can be `1x.extra-small`, `1x.small`, or `1x.medium`. + +[NOTE] +==== +The `1x.extra-small` size is not supported. It is for demonstration purposes only. +==== + +To provide these expressions, you must create an `AlertingRule` custom resource (CR) containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/[alerting rules], or a `RecordingRule` CR containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/[recording rules]. + +Administrators can configure log-based alerts or recorded metrics for `application`, `audit`, or `infrastructure` tenants. Users without administrator permissions can configure log-based alerts or recorded metrics for `application` tenants of the applications that they have access to. + +Application, audit, and infrastructure alerts are sent by default to the {product-title} monitoring stack Alertmanager in the `openshift-monitoring` namespace, unless you have disabled the local Alertmanager instance. If the Alertmanager that is used to monitor user-defined projects in the `openshift-user-workload-monitoring` namespace is enabled, application alerts are sent to the Alertmanager in this namespace by default. + +include::modules/configuring-logging-loki-ruler.adoc[leveloffset=+1] +include::modules/loki-rbac-permissions.adoc[leveloffset=+1] + +ifdef::openshift-enterprise[] +[role="_additional-resources"] +.Additional resources +* xref:../../authentication/using-rbac.adoc#using-rbac[Using RBAC to define and apply permissions] +endif::[] + +include::modules/logging-enabling-loki-alerts.adoc[leveloffset=+1] + +[role="_additional-resources"] +[id="additional-resources_custom-logging-alerts"] +== Additional resources +* xref:../../monitoring/monitoring-overview.adoc#about-openshift-monitoring_monitoring-overview[About {product-title} monitoring] +ifdef::openshift-enterprise[] +* xref:../../post_installation_configuration/configuring-alert-notifications.adoc#configuring-alert-notifications[Configuring alert notifications] +endif::[] +// maybe need an update to https://docs.openshift.com/container-platform/4.13/monitoring/monitoring-overview.html#default-monitoring-targets_monitoring-overview to talk about Loki and Vector now? Are these part of default monitoring? diff --git a/logging/logging_alerts/default-logging-alerts.adoc b/logging/logging_alerts/default-logging-alerts.adoc new file mode 100644 index 000000000000..b7eb468af4cb --- /dev/null +++ b/logging/logging_alerts/default-logging-alerts.adoc @@ -0,0 +1,21 @@ +:_content-type: ASSEMBLY +[id="default-logging-alerts"] +include::_attributes/common-attributes.adoc[] += Default logging alerts +:context: default-logging-alerts + +toc::[] + +Logging alerts are installed as part of the Cluster Logging Operator installation. Alerts depend on metrics exported by the log collection and log storage backends. These metrics are enabled if you selected the option to *Enable operator recommended cluster monitoring on this namespace* when installing the Cluster Logging Operator. 
For more information about installing logging Operators, see xref:../../logging/cluster-logging-deploying.adoc#cluster-logging-deploy-console_cluster-logging-deploying[Installing the {logging-title} using the web console].
+
+Default logging alerts are sent to the {product-title} monitoring stack Alertmanager in the `openshift-monitoring` namespace, unless you have disabled the local Alertmanager instance.
+
+include::modules/monitoring-accessing-the-alerting-ui.adoc[leveloffset=+1]
+include::modules/logging-vector-collector-alerts.adoc[leveloffset=+1]
+include::modules/logging-fluentd-collector-alerts.adoc[leveloffset=+1]
+include::modules/cluster-logging-elasticsearch-rules.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+[id="additional-resources_default-logging-alerts"]
+== Additional resources
+* xref:../../monitoring/managing-alerts.adoc#modifying-core-platform-alerting-rules_managing-alerts[Modifying core platform alerting rules]
diff --git a/logging/logging_alerts/images b/logging/logging_alerts/images
new file mode 120000
index 000000000000..847b03ed0541
--- /dev/null
+++ b/logging/logging_alerts/images
@@ -0,0 +1 @@
+../../images/
\ No newline at end of file
diff --git a/logging/logging_alerts/modules b/logging/logging_alerts/modules
new file mode 120000
index 000000000000..36719b9de743
--- /dev/null
+++ b/logging/logging_alerts/modules
@@ -0,0 +1 @@
+../../modules/
\ No newline at end of file
diff --git a/logging/logging_alerts/snippets b/logging/logging_alerts/snippets
new file mode 120000
index 000000000000..5a3f5add140e
--- /dev/null
+++ b/logging/logging_alerts/snippets
@@ -0,0 +1 @@
+../../snippets/
\ No newline at end of file
diff --git a/logging/logging_alerts/troubleshooting-logging-alerts.adoc b/logging/logging_alerts/troubleshooting-logging-alerts.adoc
new file mode 100644
index 000000000000..be2091257b55
--- /dev/null
+++ b/logging/logging_alerts/troubleshooting-logging-alerts.adoc
@@ -0,0 +1,55 @@
+:_content-type: ASSEMBLY
+[id="troubleshooting-logging-alerts"]
+include::_attributes/common-attributes.adoc[]
+= Troubleshooting logging alerts
+:context: troubleshooting-logging-alerts
+
+toc::[]
+
+You can use the following procedures to troubleshoot logging alerts on your cluster.
+
+include::modules/es-cluster-health-is-red.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+.Additional resources
+* xref:../../monitoring/reviewing-monitoring-dashboards.adoc#reviewing-monitoring-dashboards[Reviewing monitoring dashboards]
+* link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status]
+
+[id="elasticsearch-cluster-health-is-yellow"]
+== Elasticsearch cluster health status is yellow
+
+Replica shards for at least one primary shard are not allocated to nodes. Increase the node count by adjusting the `nodeCount` value in the `ClusterLogging` custom resource (CR).
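+
+For example, the following minimal sketch shows where the `nodeCount` value is set. It assumes the default `instance` name, the `openshift-logging` namespace, and an Elasticsearch log store; adjust the values to match your existing `ClusterLogging` CR rather than applying this example as-is:
+
+[source,yaml]
+----
+apiVersion: logging.openshift.io/v1
+kind: ClusterLogging
+metadata:
+  name: instance
+  namespace: openshift-logging
+spec:
+  logStore:
+    type: elasticsearch
+    elasticsearch:
+      nodeCount: 3
+# ...
+----
+
+Increasing `nodeCount` adds Elasticsearch data nodes, which gives the cluster more places to allocate replica shards.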
+ +[role="_additional-resources"] +.Additional resources +* link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status] + +include::modules/es-node-disk-low-watermark-reached.adoc[leveloffset=+1] +include::modules/es-node-disk-high-watermark-reached.adoc[leveloffset=+1] +include::modules/es-node-disk-flood-watermark-reached.adoc[leveloffset=+1] + +[id="troubleshooting-logging-alerts-es-jvm-heap-use-is-high"] +== Elasticsearch JVM heap usage is high + +The Elasticsearch node Java virtual machine (JVM) heap memory used is above 75%. Consider https://www.elastic.co/guide/en/elasticsearch/reference/current/advanced-configuration.html#set-jvm-heap-size[increasing the heap size]. + +[id="troubleshooting-logging-alerts-aggregated-logging-system-cpu-is-high"] +== Aggregated logging system CPU is high + +System CPU usage on the node is high. Check the CPU of the cluster node. Consider allocating more CPU resources to the node. + +[id="troubleshooting-logging-alerts-es-process-cpu-is-high"] +== Elasticsearch process CPU is high + +Elasticsearch process CPU usage on the node is high. Check the CPU of the cluster node. Consider allocating more CPU resources to the node. + +include::modules/es-disk-space-low.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status] + +[id="troubleshooting-logging-alerts-es-filedescriptor-usage-is-high"] +== Elasticsearch FileDescriptor usage is high + +Based on current usage trends, the predicted number of file descriptors on the node is insufficient. Check the value of `max_file_descriptors` for each node as described in the Elasticsearch link:https://www.elastic.co/guide/en/elasticsearch/reference/6.8/file-descriptors.html[File Descriptors] documentation. diff --git a/logging/troubleshooting/cluster-logging-alerts.adoc b/logging/troubleshooting/cluster-logging-alerts.adoc deleted file mode 100644 index 63079a60893e..000000000000 --- a/logging/troubleshooting/cluster-logging-alerts.adoc +++ /dev/null @@ -1,37 +0,0 @@ -:_content-type: ASSEMBLY -:context: cluster-logging-alerts -[id="cluster-logging-alerts"] -= Understanding {logging} alerts -include::_attributes/common-attributes.adoc[] -include::_attributes/attributes-openshift-dedicated.adoc[] - -toc::[] - -All of the logging collector alerts are listed on the Alerting UI of the -ifndef::openshift-rosa,openshift-dedicated[] -{product-title} web console. -endif::[] -ifdef::openshift-rosa,openshift-dedicated[] -{cluster-manager-url}. -endif::[] - -// The following include statements pull in the module files that comprise -// the assembly. Include any combination of concept, procedure, or reference -// modules required to cover the user story. You can also include other -// assemblies. - - -include::modules/cluster-logging-collector-alerts-viewing.adoc[leveloffset=+1] - -[role="_additional-resources"] -.Additional resources -* For more information on the Alerting UI, see -ifdef::openshift-enterprise,openshift-origin[] -xref:../../monitoring/managing-alerts.adoc#managing-alerts[Managing alerts]. -endif::[] -ifdef::openshift-rosa,openshift-dedicated[] -link:https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html#managing-alerts[Managing alerts]. 
-endif::[] - -include::modules/cluster-logging-collector-alerts.adoc[leveloffset=+1] -include::modules/cluster-logging-elasticsearch-rules.adoc[leveloffset=+1] diff --git a/logging/troubleshooting/cluster-logging-troubleshooting-for-critical-alerts.adoc b/logging/troubleshooting/cluster-logging-troubleshooting-for-critical-alerts.adoc deleted file mode 100644 index d6d9614047fd..000000000000 --- a/logging/troubleshooting/cluster-logging-troubleshooting-for-critical-alerts.adoc +++ /dev/null @@ -1,554 +0,0 @@ -:_content-type: ASSEMBLY -[id="cluster-logging-troubleshooting-for-critical-alerts"] -= Troubleshooting for Critical Alerts -include::_attributes/common-attributes.adoc[] - -toc::[] -:toclevels: 2 - -// WARNING - DO NOT ALTER THE URL PATH OF THIS CONTENT, OR YOU WILL BREAK LINKS FROM ALERT MESSAGES THAT LINK TO THIS CONTENT. -// However, if you must make such changes, consult with the logging team beforehand. - - -[id="elasticsearch-cluster-health-is-red"] -== Elasticsearch Cluster Health is Red - -At least one primary shard and its replicas are not allocated to a node. - -.Troubleshooting - -. Check the Elasticsearch cluster health and verify that the cluster `status` is red. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- health ----- - -. List the nodes that have joined the cluster. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cat/nodes?v ----- - -. List the Elasticsearch pods and compare them with the nodes in the command output from the previous step. -+ -[source,terminal] ----- -oc -n openshift-logging get pods -l component=elasticsearch ----- - -. If some of the Elasticsearch nodes have not joined the cluster, perform the following steps. - -.. Confirm that Elasticsearch has an elected control plane node. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cat/master?v ----- - -.. Review the pod logs of the elected control plane node for issues. -+ -[source,terminal] ----- -oc logs -c elasticsearch -n openshift-logging ----- - -.. Review the logs of nodes that have not joined the cluster for issues. -+ -[source,terminal] ----- -oc logs -c elasticsearch -n openshift-logging ----- - -. If all the nodes have joined the cluster, perform the following steps, check if the cluster is in the process of recovering. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cat/recovery?active_only=true ----- -+ -If there is no command output, the recovery process might be delayed or stalled by pending tasks. - -. Check if there are pending tasks. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- health |grep number_of_pending_tasks ----- - -. If there are pending tasks, monitor their status. -+ -If their status changes and indicates that the cluster is recovering, continue waiting. The recovery time varies according to the size of the cluster and other factors. -+ -Otherwise, if the status of the pending tasks does not change, this indicates that the recovery has stalled. - -. If it seems like the recovery has stalled, check if `cluster.routing.allocation.enable` is set to `none`. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cluster/settings?pretty ----- - -. If `cluster.routing.allocation.enable` is set to `none`, set it to `all`. 
-+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cluster/settings?pretty -X PUT -d '{"persistent": {"cluster.routing.allocation.enable":"all"}}' ----- - -. Check which indices are still red. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cat/indices?v ----- - -. If any indices are still red, try to clear them by performing the following steps. - -.. Clear the cache. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=/_cache/clear?pretty ----- - -.. Increase the max allocation retries. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=/_settings?pretty -X PUT -d '{"index.allocation.max_retries":10}' ----- - -.. Delete all the scroll items. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_search/scroll/_all -X DELETE ----- - -.. Increase the timeout. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=/_settings?pretty -X PUT -d '{"index.unassigned.node_left.delayed_timeout":"10m"}' ----- - -. If the preceding steps do not clear the red indices, delete the indices individually. - -.. Identify the red index name. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cat/indices?v ----- - -.. Delete the red index. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query= -X DELETE ----- - -. If there are no red indices and the cluster status is red, check for a continuous heavy processing load on a data node. - -.. Check if the Elasticsearch JVM Heap usage is high. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_nodes/stats?pretty ----- -+ -In the command output, review the `node_name.jvm.mem.heap_used_percent` field to determine the JVM Heap usage. - -.. Check for high CPU utilization. - -[role="_additional-resources"] -.Additional resources - -* Search for "Free up or increase disk space" in the Elasticsearch topic, link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status]. - -[id="elasticsearch-cluster-health-is-yellow"] -== Elasticsearch Cluster Health is Yellow - -Replica shards for at least one primary shard are not allocated to nodes. - -.Troubleshooting - -. Increase the node count by adjusting `nodeCount` in the `ClusterLogging` CR. - -[role="_additional-resources"] -.Additional resources - -//* Search for "Elasticsearch Disk Usage" in xref:../../logging/cluster-logging-dashboards.adoc#cluster-logging-dashboards-logging_cluster-logging-dashboards[OpenShift Logging dashboards]. -* xref:../../logging/config/cluster-logging-configuring-cr.adoc#cluster-logging-configuring-crd_cluster-logging-configuring-cr[About the Cluster Logging custom resource] -* xref:../../logging/config/cluster-logging-log-store.adoc#cluster-logging-elasticsearch-storage_cluster-logging-log-store[Configuring persistent storage for the log store] - -* Search for "Free up or increase disk space" in the Elasticsearch topic, link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status]. 
- - -// [id="elasticsearch-write-requests-rejection-jumps"] -// == Elasticsearch Write Requests Rejection Jumps -// -// .Troubleshooting -// TBD -// Note for writer: This is a warning alert and we haven't documented troubleshooting steps for warning alerts yet. I guess you can skip this in current release. - -[id="elasticsearch-node-disk-low-watermark-reached"] -== Elasticsearch Node Disk Low Watermark Reached - -Elasticsearch does not allocate shards to nodes that https://www.elastic.co/guide/en/elasticsearch/reference/6.8/disk-allocator.html[reach the low watermark]. - -.Troubleshooting - -. Identify the node on which Elasticsearch is deployed. -+ -[source,terminal] ----- -oc -n openshift-logging get po -o wide ----- - -. Check if there are `unassigned shards`. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cluster/health?pretty | grep unassigned_shards ----- - -. If there are unassigned shards, check the disk space on each node. -+ -[source,terminal] ----- -for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod -- df -h /elasticsearch/persistent; done ----- - -. Check the `nodes.node_name.fs` field to determine the free disk space on that node. -+ -If the used disk percentage is above 85%, the node has exceeded the low watermark, and shards can no longer be allocated to this node. - -. Try to increase the disk space on all nodes. - -. If increasing the disk space is not possible, try adding a new data node to the cluster. - -. If adding a new data node is problematic, decrease the total cluster redundancy policy. - -.. Check the current `redundancyPolicy`. -+ -[source,terminal] ----- -oc -n openshift-logging get es elasticsearch -o jsonpath='{.spec.redundancyPolicy}' ----- -+ -[NOTE] -==== -If you are using a `ClusterLogging` CR, enter: - -[source,terminal] ----- -oc -n openshift-logging get cl -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}' ----- -==== - -.. If the cluster `redundancyPolicy` is higher than `SingleRedundancy`, set it to `SingleRedundancy` and save this change. - -. If the preceding steps do not fix the issue, delete the old indices. - -.. Check the status of all indices on Elasticsearch. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- indices ----- - -.. Identify an old index that can be deleted. - -.. Delete the index. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query= -X DELETE ----- - -[role="_additional-resources"] -.Additional resources - -* Search for "redundancyPolicy" in the "Sample `ClusterLogging` custom resource (CR)" in xref:../../logging/config/cluster-logging-configuring-cr.adoc#cluster-logging-configuring-crd_cluster-logging-configuring-cr[About the Cluster Logging custom resource] - - -[id="elasticsearch-node-disk-high-watermark-reached"] -== Elasticsearch Node Disk High Watermark Reached - -Elasticsearch attempts to relocate shards away from a node link:https://www.elastic.co/guide/en/elasticsearch/reference/6.8/disk-allocator.html[that has reached the high watermark]. - -.Troubleshooting - -. Identify the node on which Elasticsearch is deployed. -+ -[source,terminal] ----- -oc -n openshift-logging get po -o wide ----- - -. Check the disk space on each node. 
-+ -[source,terminal] ----- -for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod -- df -h /elasticsearch/persistent; done ----- - -. Check if the cluster is rebalancing. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_cluster/health?pretty | grep relocating_shards ----- -+ -If the command output shows relocating shards, the High Watermark has been exceeded. The default value of the High Watermark is 90%. -+ -The shards relocate to a node with low disk usage that has not crossed any watermark threshold limits. - -. To allocate shards to a particular node, free up some space. - -. Try to increase the disk space on all nodes. - -. If increasing the disk space is not possible, try adding a new data node to the cluster. - -. If adding a new data node is problematic, decrease the total cluster redundancy policy. - -.. Check the current `redundancyPolicy`. -+ -[source,terminal] ----- -oc -n openshift-logging get es elasticsearch -o jsonpath='{.spec.redundancyPolicy}' ----- -+ -[NOTE] -==== -If you are using a `ClusterLogging` CR, enter: - -[source,terminal] ----- -oc -n openshift-logging get cl -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}' ----- -==== - -.. If the cluster `redundancyPolicy` is higher than `SingleRedundancy`, set it to `SingleRedundancy` and save this change. - -. If the preceding steps do not fix the issue, delete the old indices. - -.. Check the status of all indices on Elasticsearch. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- indices ----- - -.. Identify an old index that can be deleted. - -.. Delete the index. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query= -X DELETE ----- - -[role="_additional-resources"] -.Additional resources - -* Search for "redundancyPolicy" in the "Sample `ClusterLogging` custom resource (CR)" in xref:../../logging/config/cluster-logging-configuring-cr.adoc#cluster-logging-configuring-crd_cluster-logging-configuring-cr[About the Cluster Logging custom resource] - - -[id="elasticsearch-node-disk-flood-watermark-reached"] -== Elasticsearch Node Disk Flood Watermark Reached - -Elasticsearch enforces a read-only index block on every index that has both of these conditions: - -* One or more shards are allocated to the node. -* One or more disks exceed the https://www.elastic.co/guide/en/elasticsearch/reference/6.8/disk-allocator.html[flood stage]. - -.Troubleshooting - -. Check the disk space of the Elasticsearch node. -+ -[source,terminal] ----- -for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod -- df -h /elasticsearch/persistent; done ----- -+ -Check the `nodes.node_name.fs` field to determine the free disk space on that node. - -. If the used disk percentage is above 95%, it signifies that the node has crossed the flood watermark. Writing is blocked for shards allocated on this particular node. - -. Try to increase the disk space on all nodes. - -. If increasing the disk space is not possible, try adding a new data node to the cluster. - -. If adding a new data node is problematic, decrease the total cluster redundancy policy. - -.. Check the current `redundancyPolicy`. 
-+ -[source,terminal] ----- -oc -n openshift-logging get es elasticsearch -o jsonpath='{.spec.redundancyPolicy}' ----- -+ -[NOTE] -==== -If you are using a `ClusterLogging` CR, enter: - -[source,terminal] ----- -oc -n openshift-logging get cl -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}' ----- -==== - -.. If the cluster `redundancyPolicy` is higher than `SingleRedundancy`, set it to `SingleRedundancy` and save this change. - -. If the preceding steps do not fix the issue, delete the old indices. - -.. Check the status of all indices on Elasticsearch. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- indices ----- - -.. Identify an old index that can be deleted. - -.. Delete the index. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query= -X DELETE ----- - - . Continue freeing up and monitoring the disk space until the used disk space drops below 90%. Then, unblock write to this particular node. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query=_all/_settings?pretty -X PUT -d '{"index.blocks.read_only_allow_delete": null}' ----- - -[role="_additional-resources"] -.Additional resources - -* Search for "redundancyPolicy" in the "Sample `ClusterLogging` custom resource (CR)" in xref:../../logging/config/cluster-logging-configuring-cr.adoc#cluster-logging-configuring-crd_cluster-logging-configuring-cr[About the Cluster Logging custom resource] - - -[id="elasticsearch-jvm-heap-use-is-high"] -== Elasticsearch JVM Heap Use is High - -The Elasticsearch node JVM Heap memory used is above 75%. - -.Troubleshooting - -Consider https://www.elastic.co/guide/en/elasticsearch/reference/current/important-settings.html#heap-size-settings[increasing the heap size]. - -[id="aggregated-logging-system-cpu-is-high"] -== Aggregated Logging System CPU is High - -System CPU usage on the node is high. - -.Troubleshooting - -Check the CPU of the cluster node. Consider allocating more CPU resources to the node. - -[id="elasticsearch-process-cpu-is-high"] -== Elasticsearch Process CPU is High - -Elasticsearch process CPU usage on the node is high. - -.Troubleshooting - -Check the CPU of the cluster node. Consider allocating more CPU resources to the node. - -[id="elasticsearch-disk-space-is-running-low"] -== Elasticsearch Disk Space is Running Low - -The Elasticsearch Cluster is predicted to be out of disk space within the next 6 hours based on current disk usage. - -.Troubleshooting - -. Get the disk space of the Elasticsearch node. -+ -[source,terminal] ----- -for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod -- df -h /elasticsearch/persistent; done ----- - -. In the command output, check the `nodes.node_name.fs` field to determine the free disk space on that node. - -. Try to increase the disk space on all nodes. - -. If increasing the disk space is not possible, try adding a new data node to the cluster. - -. If adding a new data node is problematic, decrease the total cluster redundancy policy. - -.. Check the current `redundancyPolicy`. 
-+ -[source,terminal] ----- -oc -n openshift-logging get es elasticsearch -o jsonpath='{.spec.redundancyPolicy}' ----- -+ -[NOTE] -==== -If you are using a `ClusterLogging` CR, enter: - -[source,terminal] ----- -oc -n openshift-logging get cl -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}' ----- -==== - -.. If the cluster `redundancyPolicy` is higher than `SingleRedundancy`, set it to `SingleRedundancy` and save this change. - -. If the preceding steps do not fix the issue, delete the old indices. - -.. Check the status of all indices on Elasticsearch. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- indices ----- - -.. Identify an old index that can be deleted. - -.. Delete the index. -+ -[source,terminal] ----- -oc exec -n openshift-logging -c elasticsearch -- es_util --query= -X DELETE ----- - -[role="_additional-resources"] -.Additional resources - -* Search for "redundancyPolicy" in the "Sample `ClusterLogging` custom resource (CR)" in xref:../../logging/config/cluster-logging-configuring-cr.adoc#cluster-logging-configuring-crd_cluster-logging-configuring-cr[About the Cluster Logging custom resource] - -* Search for "ElasticsearchDiskSpaceRunningLow" in xref:../../logging/troubleshooting/cluster-logging-alerts.adoc#cluster-logging-elasticsearch-rules_cluster-logging-alerts[About Elasticsearch alerting rules]. - -* Search for "Free up or increase disk space" in the Elasticsearch topic, link:https://www.elastic.co/guide/en/elasticsearch/reference/7.13/fix-common-cluster-issues.html#fix-red-yellow-cluster-status[Fix a red or yellow cluster status]. - - - -[id="elasticsearch-filedescriptor-usage-is-high"] -== Elasticsearch FileDescriptor Usage is high - -Based on current usage trends, the predicted number of file descriptors on the node is insufficient. - -.Troubleshooting - -Check and, if needed, configure the value of `max_file_descriptors` for each node, as described in the Elasticsearch link:https://www.elastic.co/guide/en/elasticsearch/reference/current/file-descriptors.html[File descriptors] topic. - -[role="_additional-resources"] -.Additional resources - -* Search for "ElasticsearchHighFileDescriptorUsage" in xref:../../logging/troubleshooting/cluster-logging-alerts.adoc#cluster-logging-elasticsearch-rules_cluster-logging-alerts[About Elasticsearch alerting rules]. -* Search for "File Descriptors In Use" in xref:../../logging/cluster-logging-dashboards.adoc#cluster-logging-dashboards-logging_cluster-logging-dashboards[OpenShift Logging dashboards]. - - - -// Follow up items: - -// `oc edit es elasticsearch` is not documented anywhere outside this topic. 
diff --git a/logging/v5_7/logging-5-7-configuration.adoc b/logging/v5_7/logging-5-7-configuration.adoc index d06b828965b4..c6d38afecb7c 100644 --- a/logging/v5_7/logging-5-7-configuration.adoc +++ b/logging/v5_7/logging-5-7-configuration.adoc @@ -7,6 +7,3 @@ include::_attributes/common-attributes.adoc[] toc::[] include::modules/logging-multiline-except.adoc[leveloffset=+1] - -//OCP 4.13+ only -include::modules/logging-loki-alerts.adoc[leveloffset=+1] diff --git a/modules/cluster-logging-collector-alerts-viewing.adoc b/modules/cluster-logging-collector-alerts-viewing.adoc deleted file mode 100644 index 6d11df002d37..000000000000 --- a/modules/cluster-logging-collector-alerts-viewing.adoc +++ /dev/null @@ -1,28 +0,0 @@ -// Module included in the following assemblies: -// -// * logging/cluster-logging-collector.adoc - -:_content-type: PROCEDURE -[id="cluster-logging-collector-alerts-viewing_{context}"] -= Viewing logging collector alerts - -Alerts are shown in the -ifndef::openshift-rosa,openshift-dedicated[] -{product-title} web console, -endif::[] -ifdef::openshift-rosa,openshift-dedicated[] -{cluster-manager-url}, -endif::[] -on the *Alerts* tab of the Alerting UI. Alerts are in one of the following states: - -* *Firing*. The alert condition is true for the duration of the timeout. Click the *Options* menu at the end of the firing alert to view more information or silence the alert. -* *Pending* The alert condition is currently true, but the timeout has not been reached. -* *Not Firing*. The alert is not currently triggered. - -.Procedure - -To view the {logging} and other {product-title} alerts: - -. In the {product-title} console, click *Observe* → *Alerting*. - -. Click the *Alerts* tab. The alerts are listed, based on the filters selected. diff --git a/modules/cluster-logging-elasticsearch-rules.adoc b/modules/cluster-logging-elasticsearch-rules.adoc index 1e4ba49ad24e..8894d10bc2ce 100644 --- a/modules/cluster-logging-elasticsearch-rules.adoc +++ b/modules/cluster-logging-elasticsearch-rules.adoc @@ -1,8 +1,12 @@ -:_content-type: CONCEPT +// Module included in the following assemblies: +// +// * logging/logging_alerts/default-logging-alerts.adoc + +:_content-type: REFERENCE [id="cluster-logging-elasticsearch-rules_{context}"] -= About Elasticsearch alerting rules += Elasticsearch alerting rules -You can view these alerting rules in Prometheus. +You can view these alerting rules in the {product-title} web console. .Alerting rules [cols="3,6,1",options="header"] @@ -11,7 +15,6 @@ You can view these alerting rules in Prometheus. |Description |Severity - |`ElasticsearchClusterNotHealthy` |The cluster health status has been RED for at least 2 minutes. The cluster does not accept writes, shards may be missing, or the master node hasn't been elected yet. diff --git a/modules/configuring-logging-loki-ruler.adoc b/modules/configuring-logging-loki-ruler.adoc new file mode 100644 index 000000000000..e94dab1f64f2 --- /dev/null +++ b/modules/configuring-logging-loki-ruler.adoc @@ -0,0 +1,37 @@ +// Module included in the following assemblies: +// +// * logging/logging_alerts/custom-logging-alerts.adoc + +:_content-type: PROCEDURE +[id="configuring-logging-loki-ruler_{context}"] += Configuring the ruler + +When the LokiStack ruler component is enabled, users can define a group of link:https://grafana.com/docs/loki/latest/query/[LogQL] expressions that trigger logging alerts or recorded metrics. + +Administrators can enable the ruler by modifying the `LokiStack` custom resource (CR). 
+ +.Procedure + +* Enable the ruler by ensuring that the `LokiStack` CR contains the following spec configuration: ++ +[source,yaml] +---- +apiVersion: loki.grafana.com/v1 +kind: LokiStack +metadata: + name: + namespace: +spec: +# ... + rules: + enabled: true <1> + selector: + matchLabels: + openshift.io/: "true" <2> + namespaceSelector: + matchLabels: + openshift.io/: "true" <3> +---- +<1> Enable Loki alerting and recording rules in your cluster. +<2> Add a custom label that can be added to namespaces where you want to enable the use of logging alerts and metrics. +<3> Add a custom label that can be added to namespaces where you want to enable the use of logging alerts and metrics. diff --git a/modules/es-cluster-health-is-red.adoc b/modules/es-cluster-health-is-red.adoc new file mode 100644 index 000000000000..bffcdc0eb897 --- /dev/null +++ b/modules/es-cluster-health-is-red.adoc @@ -0,0 +1,172 @@ +// Module included in the following assemblies: +// +// * logging/logging_alerts/troubleshooting-logging-alerts.adoc + +:_content-type: PROCEDURE +[id="es-cluster-health-is-red_{context}"] += Elasticsearch cluster health status is red + +At least one primary shard and its replicas are not allocated to a node. Use the following procedure to troubleshoot this alert. + +include::snippets/es-pod-var-logging.adoc[] + +.Procedure + +. Check the Elasticsearch cluster health and verify that the cluster `status` is red by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME -- health +---- + +. List the nodes that have joined the cluster by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query=_cat/nodes?v +---- + +. List the Elasticsearch pods and compare them with the nodes in the command output from the previous step, by running the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get pods -l component=elasticsearch +---- + +. If some of the Elasticsearch nodes have not joined the cluster, perform the following steps. + +.. Confirm that Elasticsearch has an elected master node by running the following command and observing the output: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query=_cat/master?v +---- + +.. Review the pod logs of the elected master node for issues by running the following command and observing the output: ++ +[source,terminal] +---- +$ oc logs -c elasticsearch -n openshift-logging +---- + +.. Review the logs of nodes that have not joined the cluster for issues by running the following command and observing the output: ++ +[source,terminal] +---- +$ oc logs -c elasticsearch -n openshift-logging +---- + +. If all the nodes have joined the cluster, check if the cluster is in the process of recovering by running the following command and observing the output: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query=_cat/recovery?active_only=true +---- ++ +If there is no command output, the recovery process might be delayed or stalled by pending tasks. + +. Check if there are pending tasks by running the following command and observing the output: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- health | grep number_of_pending_tasks +---- + +. If there are pending tasks, monitor their status. 
If their status changes and indicates that the cluster is recovering, continue waiting. The recovery time varies according to the size of the cluster and other factors. Otherwise, if the status of the pending tasks does not change, this indicates that the recovery has stalled.
+
+. If it seems like the recovery has stalled, check if the `cluster.routing.allocation.enable` value is set to `none`, by running the following command and observing the output:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=_cluster/settings?pretty
+----
+
+. If the `cluster.routing.allocation.enable` value is set to `none`, set it to `all`, by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=_cluster/settings?pretty \
+  -X PUT -d '{"persistent": {"cluster.routing.allocation.enable":"all"}}'
+----
+
+. Check if any indices are still red by running the following command and observing the output:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=_cat/indices?v
+----
+
+. If any indices are still red, try to clear them by performing the following steps.
+
+.. Clear the cache by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=/_cache/clear?pretty
+----
+
+.. Increase the max allocation retries by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=/_settings?pretty \
+  -X PUT -d '{"index.allocation.max_retries":10}'
+----
+
+.. Delete all the scroll items by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=_search/scroll/_all -X DELETE
+----
+
+.. Increase the timeout by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=/_settings?pretty \
+  -X PUT -d '{"index.unassigned.node_left.delayed_timeout":"10m"}'
+----
+
+. If the preceding steps do not clear the red indices, delete the indices individually.
+
+.. Identify the red index name by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=_cat/indices?v
+----
+
+.. Delete the red index by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query= -X DELETE
+----
+
+. If there are no red indices and the cluster status is red, check for a continuous heavy processing load on a data node.
+
+.. Check if the Elasticsearch JVM Heap usage is high by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=_nodes/stats?pretty
+----
++
+In the command output, review the `node_name.jvm.mem.heap_used_percent` field to determine the JVM Heap usage.
+
+.. Check for high CPU utilization. For more information about CPU utilization, see the {product-title} "Reviewing monitoring dashboards" documentation.
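++
+For example, if the cluster metrics API is available, one way to view the CPU usage of the Elasticsearch pods is to run the following command, which uses the same `component=elasticsearch` label selector as the earlier steps in this procedure:
++
+[source,terminal]
+----
+$ oc adm top pods -n openshift-logging -l component=elasticsearch
+----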
diff --git a/modules/es-disk-space-low.adoc b/modules/es-disk-space-low.adoc new file mode 100644 index 000000000000..9cac93bc245c --- /dev/null +++ b/modules/es-disk-space-low.adoc @@ -0,0 +1,72 @@ +// Module included in the following assemblies: +// +// * logging/logging_alerts/troubleshooting-logging-alerts.adoc + +:_content-type: PROCEDURE +[id="es-disk-space-low_{context}"] += Elasticsearch disk space is running low + +Elasticsearch is predicted to run out of disk space within the next 6 hours based on current disk usage. Use the following procedure to troubleshoot this alert. + +.Procedure + +. Get the disk space of the Elasticsearch node: ++ +[source,terminal] +---- +$ for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; \ + do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod \ + -- df -h /elasticsearch/persistent; done +---- + +. In the command output, check the `Avail` column to determine the free disk space on that node. ++ +.Example output +[source,terminal] +---- +elasticsearch-cdm-kcrsda6l-1-586cc95d4f-h8zq8 +Filesystem Size Used Avail Use% Mounted on +/dev/nvme1n1 19G 522M 19G 3% /elasticsearch/persistent +elasticsearch-cdm-kcrsda6l-2-5b548fc7b-cwwk7 +Filesystem Size Used Avail Use% Mounted on +/dev/nvme2n1 19G 522M 19G 3% /elasticsearch/persistent +elasticsearch-cdm-kcrsda6l-3-5dfc884d99-59tjw +Filesystem Size Used Avail Use% Mounted on +/dev/nvme3n1 19G 528M 19G 3% /elasticsearch/persistent +---- + +. Increase the disk space on all nodes. If increasing the disk space is not possible, try adding a new data node to the cluster, or decrease the total cluster redundancy policy. + +. To check the current `redundancyPolicy`, run the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get es elasticsearch -o jsonpath='{.spec.redundancyPolicy}' +---- ++ +If you are using a `ClusterLogging` resource on your cluster, run the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get cl \ + -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}' +---- ++ +If the cluster `redundancyPolicy` value is higher than the `SingleRedundancy` value, set it to the `SingleRedundancy` value and save this change. + +. If the preceding steps do not fix the issue, delete the old indices. +.. Check the status of all indices on Elasticsearch by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME -- indices +---- + +.. Identify an old index that can be deleted. +.. Delete the index by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query= -X DELETE +---- diff --git a/modules/es-node-disk-flood-watermark-reached.adoc b/modules/es-node-disk-flood-watermark-reached.adoc new file mode 100644 index 000000000000..af81e14a8079 --- /dev/null +++ b/modules/es-node-disk-flood-watermark-reached.adoc @@ -0,0 +1,89 @@ +// Module included in the following assemblies: +// +// * logging/logging_alerts/troubleshooting-logging-alerts.adoc + +:_content-type: PROCEDURE +[id="es-node-disk-flood-watermark-reached_{context}"] += Elasticsearch node disk flood watermark reached + +Elasticsearch enforces a read-only index block on every index that has both of these conditions: + +* One or more shards are allocated to the node. 
+* One or more disks exceed the https://www.elastic.co/guide/en/elasticsearch/reference/6.8/disk-allocator.html[flood stage]. + +Use the following procedure to troubleshoot this alert. + +include::snippets/es-pod-var-logging.adoc[] + +.Procedure + +. Get the disk space of the Elasticsearch node: ++ +[source,terminal] +---- +$ for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; \ + do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod \ + -- df -h /elasticsearch/persistent; done +---- + +. In the command output, check the `Avail` column to determine the free disk space on that node. ++ +.Example output +[source,terminal] +---- +elasticsearch-cdm-kcrsda6l-1-586cc95d4f-h8zq8 +Filesystem Size Used Avail Use% Mounted on +/dev/nvme1n1 19G 522M 19G 3% /elasticsearch/persistent +elasticsearch-cdm-kcrsda6l-2-5b548fc7b-cwwk7 +Filesystem Size Used Avail Use% Mounted on +/dev/nvme2n1 19G 522M 19G 3% /elasticsearch/persistent +elasticsearch-cdm-kcrsda6l-3-5dfc884d99-59tjw +Filesystem Size Used Avail Use% Mounted on +/dev/nvme3n1 19G 528M 19G 3% /elasticsearch/persistent +---- + +. Increase the disk space on all nodes. If increasing the disk space is not possible, try adding a new data node to the cluster, or decrease the total cluster redundancy policy. + +. To check the current `redundancyPolicy`, run the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get es elasticsearch \ + -o jsonpath='{.spec.redundancyPolicy}' +---- ++ +If you are using a `ClusterLogging` resource on your cluster, run the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get cl \ + -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}' +---- ++ +If the cluster `redundancyPolicy` value is higher than the `SingleRedundancy` value, set it to the `SingleRedundancy` value and save this change. + +. If the preceding steps do not fix the issue, delete the old indices. +.. Check the status of all indices on Elasticsearch by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME -- indices +---- + +.. Identify an old index that can be deleted. +.. Delete the index by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query= -X DELETE +---- + +. Continue freeing up and monitoring the disk space. After the used disk space drops below 90%, unblock writing to this node by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query=_all/_settings?pretty \ + -X PUT -d '{"index.blocks.read_only_allow_delete": null}' +---- diff --git a/modules/es-node-disk-high-watermark-reached.adoc b/modules/es-node-disk-high-watermark-reached.adoc new file mode 100644 index 000000000000..4c8a1fd29e45 --- /dev/null +++ b/modules/es-node-disk-high-watermark-reached.adoc @@ -0,0 +1,78 @@ +// Module included in the following assemblies: +// +// * logging/logging_alerts/troubleshooting-logging-alerts.adoc + +:_content-type: PROCEDURE +[id="es-node-disk-high-watermark-reached_{context}"] += Elasticsearch node disk high watermark reached + +Elasticsearch attempts to relocate shards away from a node that has reached the high watermark to a node with low disk usage that has not crossed any watermark threshold limits. + +To allocate shards to a particular node, you must free up some space on that node. 
If increasing the disk space is not possible, try adding a new data node to the cluster, or decrease the total cluster redundancy policy. + +include::snippets/es-pod-var-logging.adoc[] + +.Procedure + +. Identify the node on which Elasticsearch is deployed by running the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get po -o wide +---- + +. Check the disk space on each node: ++ +[source,terminal] +---- +$ for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; \ + do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod \ + -- df -h /elasticsearch/persistent; done +---- + +. Check if the cluster is rebalancing: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query=_cluster/health?pretty | grep relocating_shards +---- ++ +If the command output shows relocating shards, the high watermark has been exceeded. The default value of the high watermark is 90%. + +. Increase the disk space on all nodes. If increasing the disk space is not possible, try adding a new data node to the cluster, or decrease the total cluster redundancy policy. + +. To check the current `redundancyPolicy`, run the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get es elasticsearch \ + -o jsonpath='{.spec.redundancyPolicy}' +---- ++ +If you are using a `ClusterLogging` resource on your cluster, run the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get cl \ + -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}' +---- ++ +If the cluster `redundancyPolicy` value is higher than the `SingleRedundancy` value, set it to the `SingleRedundancy` value and save this change. + +. If the preceding steps do not fix the issue, delete the old indices. +.. Check the status of all indices on Elasticsearch by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME -- indices +---- + +.. Identify an old index that can be deleted. +.. Delete the index by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query= -X DELETE +---- diff --git a/modules/es-node-disk-low-watermark-reached.adoc b/modules/es-node-disk-low-watermark-reached.adoc new file mode 100644 index 000000000000..825215001833 --- /dev/null +++ b/modules/es-node-disk-low-watermark-reached.adoc @@ -0,0 +1,90 @@ +// Module included in the following assemblies: +// +// * logging/logging_alerts/troubleshooting-logging-alerts.adoc + +:_content-type: PROCEDURE +[id="es-node-disk-low-watermark-reached_{context}"] += Elasticsearch node disk low watermark reached + +Elasticsearch does not allocate shards to nodes that reach the low watermark. + +include::snippets/es-pod-var-logging.adoc[] + +.Procedure + +. Identify the node on which Elasticsearch is deployed by running the following command: ++ +[source,terminal] +---- +$ oc -n openshift-logging get po -o wide +---- + +. Check if there are unassigned shards by running the following command: ++ +[source,terminal] +---- +$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \ + -- es_util --query=_cluster/health?pretty | grep unassigned_shards +---- + +. 
+
+. If there are unassigned shards, check the disk space on each node by running the following command:
++
+[source,terminal]
+----
+$ for pod in `oc -n openshift-logging get po -l component=elasticsearch -o jsonpath='{.items[*].metadata.name}'`; \
+  do echo $pod; oc -n openshift-logging exec -c elasticsearch $pod \
+  -- df -h /elasticsearch/persistent; done
+----
+
+. In the command output, check the `Use` column to determine the used disk percentage on that node.
++
+.Example output
+[source,terminal]
+----
+elasticsearch-cdm-kcrsda6l-1-586cc95d4f-h8zq8
+Filesystem Size Used Avail Use% Mounted on
+/dev/nvme1n1 19G 522M 19G 3% /elasticsearch/persistent
+elasticsearch-cdm-kcrsda6l-2-5b548fc7b-cwwk7
+Filesystem Size Used Avail Use% Mounted on
+/dev/nvme2n1 19G 522M 19G 3% /elasticsearch/persistent
+elasticsearch-cdm-kcrsda6l-3-5dfc884d99-59tjw
+Filesystem Size Used Avail Use% Mounted on
+/dev/nvme3n1 19G 528M 19G 3% /elasticsearch/persistent
+----
++
+If the used disk percentage is above 85%, the node has exceeded the low watermark, and shards can no longer be allocated to this node.
+
+. To check the current `redundancyPolicy`, run the following command:
++
+[source,terminal]
+----
+$ oc -n openshift-logging get es elasticsearch \
+  -o jsonpath='{.spec.redundancyPolicy}'
+----
++
+If you are using a `ClusterLogging` resource on your cluster, run the following command:
++
+[source,terminal]
+----
+$ oc -n openshift-logging get cl \
+  -o jsonpath='{.items[*].spec.logStore.elasticsearch.redundancyPolicy}'
+----
++
+If the cluster `redundancyPolicy` value is higher than the `SingleRedundancy` value, set it to the `SingleRedundancy` value and save this change.
+
+. If the preceding steps do not fix the issue, delete the old indices.
+.. Check the status of all indices on Elasticsearch by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME -- indices
+----
+
+.. Identify an old index that can be deleted.
+.. Delete the index by running the following command:
++
+[source,terminal]
+----
+$ oc exec -n openshift-logging -c elasticsearch $ES_POD_NAME \
+  -- es_util --query=<elasticsearch_index_name> -X DELETE
+----
diff --git a/modules/logging-loki-alerts.adoc b/modules/logging-enabling-loki-alerts.adoc
similarity index 75%
rename from modules/logging-loki-alerts.adoc
rename to modules/logging-enabling-loki-alerts.adoc
index 097c38a2f2e9..c35685fd7683 100644
--- a/modules/logging-loki-alerts.adoc
+++ b/modules/logging-enabling-loki-alerts.adoc
@@ -1,31 +1,27 @@
 // Module included in the following assemblies:
-// logging-5-7-configuration
+//
+// logging/logging_alerts/custom-logging-alerts.adoc
 
 :_content-type: PROCEDURE
-[id="logging-loki-alerts_{context}"]
-= Enabling log based alerts with Loki
-Loki alerting rules use link:https://grafana.com/docs/loki/latest/logql/[LogQL] and follow link:https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules[Prometheus formatting]. You can set log based alerts by creating an `AlertingRule` custom resource (CR). `AlertingRule` CRs may be created for `application`, `audit`, or `infrastructure` tenants.
+[id="logging-enabling-loki-alerts_{context}"]
+= Creating a log-based alerting rule with Loki
+
+The `AlertingRule` CR contains a set of specifications and webhook validation definitions to declare groups of alerting rules for a single `LokiStack` instance. In addition, the webhook validation definition provides support for rule validation conditions:
+
+* If an `AlertingRule` CR includes an invalid `interval` period, it is an invalid alerting rule.
+* If an `AlertingRule` CR includes an invalid `for` period, it is an invalid alerting rule.
+* If an `AlertingRule` CR includes an invalid LogQL `expr`, it is an invalid alerting rule.
+* If an `AlertingRule` CR includes two groups with the same name, it is an invalid alerting rule.
+* If none of the above applies, an alerting rule is considered valid.
 
 [options="header"]
 |================================================
 | Tenant type | Valid namespaces for `AlertingRule` CRs
 | application | `<your_application_namespace>`
 | audit | `openshift-logging`
 | infrastructure | `openshift-/\*`, `kube-/\*`, `default`
 |================================================
 
-Application, Audit, and Infrastructure alerts are sent to the Cluster Monitoring Operator (CMO) Alertmanager in the `openshift-monitoring` namespace by default unless you have disabled the local `Alertmanager` instance.
-
-Application alerts are not sent to the CMO Alertmanager in the `openshift-user-workload-monitoring` namespace by default unless you have enabled a separate `Alertmanager` instance.
-
-The `AlertingRule` CR contains a set of specifications and webhook validation definitions to declare groups of alerting rules for a single LokiStack instance. In addition, the webhook validation definition provides support for rule validation conditions:
-
-* If an `AlertingRule` CR includes an invalid `interval` period, it is an invalid alerting rule
-* If an `AlertingRule` CR includes an invalid `for` period, it is an invalid alerting rule.
-* If an `AlertingRule` CR includes an invalid LogQL `expr`, it is an invalid alerting rule.
-* If an `AlertingRule` CR includes two groups with the same name, it is an invalid alerting rule.
-* If none of above applies, an `AlertingRule` is considered a valid alerting rule.
-
 .Prerequisites
 
 * {logging-title-uc} Operator 5.7 and later
@@ -44,7 +40,7 @@ The `AlertingRule` CR contains a set of specifications and webhook validation de
     name: loki-operator-alerts
     namespace: openshift-operators-redhat <1>
     labels: <2>
-      openshift.io/cluster-monitoring: "true"
+      openshift.io/<label_name>: "true"
   spec:
     tenantID: "infrastructure" <3>
     groups:
@@ -80,7 +76,7 @@ The `AlertingRule` CR contains a set of specifications and webhook validation de
     name: app-user-workload
     namespace: app-ns <1>
     labels: <2>
-      openshift.io/cluster-monitoring: "true"
+      openshift.io/<label_name>: "true"
   spec:
     tenantID: "application"
     groups:
diff --git a/modules/cluster-logging-collector-alerts.adoc b/modules/logging-fluentd-collector-alerts.adoc
similarity index 57%
rename from modules/cluster-logging-collector-alerts.adoc
rename to modules/logging-fluentd-collector-alerts.adoc
index 453c2076b96c..dcdb0c4f1f9b 100644
--- a/modules/cluster-logging-collector-alerts.adoc
+++ b/modules/logging-fluentd-collector-alerts.adoc
@@ -1,21 +1,14 @@
 // Module included in the following assemblies:
 //
-// * logging/cluster-logging-collector.adoc
-
-:_content-type: CONCEPT
-[id="cluster-logging-collector-alerts_{context}"]
-= About logging collector alerts
-
-The following alerts are generated by the logging collector. You can view these alerts in the
-ifndef::openshift-rosa,openshift-dedicated[]
-{product-title} web console
-endif::[]
-ifdef::openshift-rosa,openshift-dedicated[]
-{cluster-manager-url}
-endif::[]
-on the *Alerts* page of the Alerting UI.
-
-.Fluentd Prometheus alerts
+// * logging/logging_alerts/default-logging-alerts.adoc
+
+:_content-type: REFERENCE
+[id="logging-fluentd-collector-alerts_{context}"]
+= Fluentd collector alerts
+
+The following alerts are generated by the legacy Fluentd log collector. You can view these alerts in the {product-title} web console.
+
+.Fluentd collector alerts
 [cols="2,2,2,1",options="header"]
 |===
 |Alert |Message |Description |Severity
@@ -31,9 +24,9 @@ on the *Alerts* page of the Alerting UI.
 |Critical
 
 |`FluentdQueueLengthIncreasing`
-|`In the last 12h, fluentd buffer queue length constantly increased more than 1. Current value is <value>.`
+|`In the last 1h, fluentd buffer queue length constantly increased more than 1. Current value is <value>.`
 |Fluentd is reporting that the queue size is increasing.
-|Critical
+|Warning
 
 |`FluentDVeryHighErrorRate`
 |`<value> of records have resulted in an error by fluentd <instance>.`
diff --git a/modules/logging-vector-collector-alerts.adoc b/modules/logging-vector-collector-alerts.adoc
new file mode 100644
index 000000000000..312023f4373c
--- /dev/null
+++ b/modules/logging-vector-collector-alerts.adoc
@@ -0,0 +1,36 @@
+// Module included in the following assemblies:
+//
+// * logging/logging_alerts/default-logging-alerts.adoc
+
+:_content-type: REFERENCE
+[id="logging-vector-collector-alerts_{context}"]
+= Vector collector alerts
+
+In logging 5.7 and later versions, the following alerts are generated by the Vector collector. You can view these alerts in the {product-title} web console.
+
+.Vector collector alerts
+[cols="2,2,2,1",options="header"]
+|===
+|Alert |Message |Description |Severity
+
+|`CollectorHighErrorRate`
+|`<value> of records have resulted in an error by vector <instance>.`
+|The number of vector output errors is high, by default more than 10 in the previous 15 minutes.
+|Warning
+
+|`CollectorNodeDown`
+|`Prometheus could not scrape vector for more than 10m.`
+|Vector is reporting that Prometheus could not scrape a specific Vector instance.
+|Critical
+
+|`CollectorVeryHighErrorRate`
+|`<value> of records have resulted in an error by vector <instance>.`
+|The number of Vector component errors is very high, by default more than 25 in the previous 15 minutes.
+|Critical
+
+|`FluentdQueueLengthIncreasing`
+|`In the last 1h, fluentd buffer queue length constantly increased more than 1. Current value is <value>.`
+|Fluentd is reporting that the queue size is increasing.
+|Warning
+
+|===
diff --git a/modules/loki-rbac-permissions.adoc b/modules/loki-rbac-permissions.adoc
new file mode 100644
index 000000000000..badcb23981d5
--- /dev/null
+++ b/modules/loki-rbac-permissions.adoc
@@ -0,0 +1,28 @@
+// Module included in the following assemblies:
+//
+// * logging/logging_alerts/custom-logging-alerts.adoc
+
+:_content-type: PROCEDURE
+[id="loki-rbac-permissions_{context}"]
+= Authorizing Loki rules RBAC permissions
+
+// May need to re-add this after 5.8 release - check with eng later
+// In logging 5.7 and later, the Cluster Logging Operator provides `alertingrule-editor-role` and `recordingrule-editor-role` cluster roles, which enable users to modify alerting and recording rules for the LokiStack.
+
+Administrators can allow users to create and manage their own alerting rules by creating a `ClusterRole` object and binding this role to usernames. The `ClusterRole` object defines the necessary role-based access control (RBAC) permissions for users.
+
+.Prerequisites
+
+* The Cluster Logging Operator is installed in the `openshift-logging` namespace.
+* You have administrator permissions.
+
+.Procedure
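+
+. Create a cluster role that defines the necessary RBAC permissions. The following `ClusterRole` object is a minimal sketch: the role name is an example, and the rules assume that users only need to manage `AlertingRule` objects in the `loki.grafana.com` API group.
++
+[source,yaml]
+----
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: custom-alertingrule-editor # example name, not defined by this procedure
+rules:
+- apiGroups:
+  - loki.grafana.com # API group of the AlertingRule custom resource
+  resources:
+  - alertingrules
+  verbs:
+  - create
+  - get
+  - list
+  - watch
+  - update
+  - patch
+  - delete
+----
+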
+. Bind the appropriate cluster roles to the username:
++
+.Example binding command
+[source,terminal]
+----
+$ oc adm policy add-role-to-user <cluster_role_name> -n <namespace> <username>
+----
diff --git a/modules/monitoring-accessing-the-alerting-ui.adoc b/modules/monitoring-accessing-the-alerting-ui.adoc
index f961c1d0e461..cbc4787bebd3 100644
--- a/modules/monitoring-accessing-the-alerting-ui.adoc
+++ b/modules/monitoring-accessing-the-alerting-ui.adoc
@@ -1,9 +1,10 @@
 // Module included in the following assemblies:
 //
 // * monitoring/managing-alerts.adoc
+// * logging/logging_alerts/log-storage-alerts.adoc
 
 :_content-type: PROCEDURE
-[id="accessing_the_alerting_ui_{context}"]
+[id="monitoring-accessing-the-alerting-ui_{context}"]
 = Accessing the Alerting UI in the Administrator and Developer perspectives
 
 The Alerting UI is accessible through the Administrator perspective and the Developer perspective in the {product-title} web console.
diff --git a/snippets/es-pod-var-logging.adoc b/snippets/es-pod-var-logging.adoc
new file mode 100644
index 000000000000..9eb83e31ae4b
--- /dev/null
+++ b/snippets/es-pod-var-logging.adoc
@@ -0,0 +1,31 @@
+// Snippet included in the following assemblies:
+//
+//
+// Snippet included in the following modules:
+//
+// * es-node-disk-low-watermark-reached.adoc
+// * es-node-disk-high-watermark-reached.adoc
+// * es-node-disk-flood-watermark-reached.adoc
+
+:_content-type: SNIPPET
+
+[TIP]
+====
+Some commands in this documentation reference an Elasticsearch pod by using a `$ES_POD_NAME` shell variable. If you want to copy and paste the commands directly from this documentation, you must set this variable to a value that is valid for your Elasticsearch cluster.
+
+You can list the available Elasticsearch pods by running the following command:
+
+[source,terminal]
+----
+$ oc -n openshift-logging get pods -l component=elasticsearch
+----
+
+Choose one of the pods listed and set the `$ES_POD_NAME` variable by running the following command:
+
+[source,terminal]
+----
+$ export ES_POD_NAME=<elasticsearch_pod_name>
+----
+
+You can now use the `$ES_POD_NAME` variable in commands.
+====
diff --git a/support/index.adoc b/support/index.adoc
index ff377f7c838a..48ee53d5003f 100644
--- a/support/index.adoc
+++ b/support/index.adoc
@@ -129,7 +129,7 @@ endif::openshift-rosa,openshift-dedicated[]
 ** xref:../logging/troubleshooting/cluster-logging-cluster-status.adoc#cluster-logging-cluster-status[Status of the Logging Operator].
 ** xref:../logging/troubleshooting/cluster-logging-cluster-status.adoc#cluster-logging-cluster-status[Status of the Log store].
-** xref:../logging/troubleshooting/cluster-logging-alerts.adoc#cluster-logging-alerts[OpenShift Logging alerts].
+** xref:../logging/logging_alerts/troubleshooting-logging-alerts.adoc#troubleshooting-logging-alerts[Troubleshooting logging alerts].
 ** xref:../logging/cluster-logging-support.adoc#cluster-logging-support-must-gather_cluster-logging-support[Information about your OpenShift logging environment using `oc adm must-gather` command].
 * xref:../support/troubleshooting/diagnosing-oc-issues.adoc#diagnosing-oc-issues[OpenShift CLI (oc) issues]: Investigate OpenShift CLI (oc) issues by increasing the log level.