From 53413032a3f4bfbb92af26ad5f86afe9e445b47c Mon Sep 17 00:00:00 2001 From: Darshan Nagaraj Date: Wed, 25 May 2022 12:50:41 +0530 Subject: [PATCH] CFE-254: Adds Node observability new section --- _topic_maps/_topic_map.yml | 7 +- ...-observability-create-custom-resource.adoc | 97 ++++++++++++++ ...ode-observability-high-level-workflow.adoc | 11 ++ modules/node-observability-install-cli.adoc | 119 ++++++++++++++++++ ...ode-observability-install-web-console.adoc | 31 +++++ modules/node-observability-installation.adoc | 8 ++ ...ode-observability-run-profiling-query.adoc | 83 ++++++++++++ .../node-observability-operator.adoc | 25 ++++ 8 files changed, 379 insertions(+), 2 deletions(-) create mode 100644 modules/node-observability-create-custom-resource.adoc create mode 100644 modules/node-observability-high-level-workflow.adoc create mode 100644 modules/node-observability-install-cli.adoc create mode 100644 modules/node-observability-install-web-console.adoc create mode 100644 modules/node-observability-installation.adoc create mode 100644 modules/node-observability-run-profiling-query.adoc create mode 100644 scalability_and_performance/node-observability-operator.adoc diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index 528f2ea29255..4e96b13f6bd0 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -1230,7 +1230,7 @@ Topics: File: about-advertising-ipaddresspool - Name: Configuring MetalLB BGP peers File: metallb-configure-bgp-peers - - Name: Advertising an IP address pool using the community alias + - Name: Advertising an IP address pool using the community alias File: metallb-configure-community-alias - Name: Configuring MetalLB BFD profiles File: metallb-configure-bfd-profiles @@ -2047,7 +2047,7 @@ Topics: - Name: Enabling features using FeatureGates File: nodes-cluster-enabling-features Distros: openshift-enterprise,openshift-origin - - Name: Improving cluster stability in high latency environments using worker latency 
profiles + - Name: Improving cluster stability in high latency environments using worker latency profiles File: nodes-cluster-worker-latency-profiles Distros: openshift-enterprise,openshift-origin - Name: Remote worker nodes on the network edge @@ -2271,6 +2271,9 @@ Topics: - Name: Deploying distributed units at scale in a disconnected environment File: ztp-deploying-disconnected Distros: openshift-origin,openshift-enterprise +- Name: Requesting CRI-O and Kubelet profiling data using the Node Observability Operator + File: node-observability-operator + Distros: openshift-origin,openshift-enterprise --- Name: Specialized hardware and driver enablement Dir: hardware_enablement diff --git a/modules/node-observability-create-custom-resource.adoc b/modules/node-observability-create-custom-resource.adoc new file mode 100644 index 000000000000..8635f7e3e982 --- /dev/null +++ b/modules/node-observability-create-custom-resource.adoc @@ -0,0 +1,97 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/node-observability-operator.adoc + +:_content-type: PROCEDURE +[id="creating-node-observability-custom-resource_{context}"] += Creating the Node Observability custom resource + +Before you run profiling queries, you must create a `NodeObservability` custom resource (CR). + +[IMPORTANT] +==== +Creating a `NodeObservability` CR reboots all the worker nodes. It might take 10 or more minutes to complete. +==== + +When you apply the `NodeObservability` CR, it creates the necessary machine config and machine config pool CRs to enable the CRI-O profiling on the worker nodes. + +[NOTE] +==== +Kubelet profiling is enabled by default. +==== + +The CRI-O unix socket of the node is mounted on the agent pod, which allows the agent to communicate with CRI-O to run the pprof request. Similarly, the `kubelet-serving-ca` certificate chain is mounted on the agent pod, which allows secure communication between the agent and the node's kubelet endpoint.
+ +.Prerequisites +* You have installed the Node Observability Operator. +* You have installed the OpenShift CLI (oc). +* You have access to the cluster with `cluster-admin` privileges. + +.Procedure + +. Log in to the {product-title} CLI as a user with the `cluster-admin` role by running the following command: ++ +[source,terminal] +---- +$ oc login -u kubeadmin https://<HOSTNAME>:6443 +---- + +. Switch to the `node-observability-operator` namespace by running the following command: ++ +[source,terminal] +---- +$ oc project node-observability-operator +---- + +. Create a CR file named `nodeobservability.yaml` that contains the following text: ++ +[source,yaml] +---- + apiVersion: nodeobservability.olm.openshift.io/v1alpha1 + kind: NodeObservability + metadata: + name: cluster <1> + spec: + labels: + node-role.kubernetes.io/worker: "" + type: crio-kubelet +---- +<1> You must specify the name as `cluster` because there should be only one `NodeObservability` CR per cluster. + +. Apply the `NodeObservability` CR by running the following command: ++ +[source,terminal] +---- +$ oc apply -f nodeobservability.yaml +---- + ++ +.Example output +[source,terminal] +---- +nodeobservability.olm.openshift.io/cluster created +---- + +. Review the status of the `NodeObservability` CR by running the following command: ++ +[source,terminal] +---- +$ oc get nob/cluster -o yaml | yq '.status.conditions' +---- + ++ +.Example output +[source,terminal] +---- +conditions: + conditions: + - lastTransitionTime: "2022-07-05T07:33:54Z" + message: 'DaemonSet node-observability-ds ready: true NodeObservabilityMachineConfig + ready: true' + reason: Ready + status: "True" + type: Ready +---- + ++ +The `NodeObservability` CR is ready when the reason is `Ready` and the status is `True`.
diff --git a/modules/node-observability-high-level-workflow.adoc b/modules/node-observability-high-level-workflow.adoc new file mode 100644 index 000000000000..53d53ee2e73f --- /dev/null +++ b/modules/node-observability-high-level-workflow.adoc @@ -0,0 +1,11 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/node-observability-operator.adoc + +:_content-type: CONCEPT +[id="workflow-node-observability-operator_{context}"] += High-level workflow of the Node Observability Operator + +After you install the Node Observability Operator in the {product-title} cluster, you must create a `NodeObservability` custom resource, which creates a DaemonSet to deploy a Node Observability agent on each worker node. + +To request a profiling query, you must create a `NodeObservabilityRun` resource that requests the deployed Node Observability agent to trigger the CRI-O and Kubelet profiling. After the profiling is completed, the Node Observability agent stores the profiling data in the `/run/node-observability` directory of the container file system, where it is available for query. diff --git a/modules/node-observability-install-cli.adoc b/modules/node-observability-install-cli.adoc new file mode 100644 index 000000000000..ed5d4276a81f --- /dev/null +++ b/modules/node-observability-install-cli.adoc @@ -0,0 +1,119 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/node-observability-operator.adoc + +:_content-type: PROCEDURE +[id="install-node-observability-using-cli_{context}"] += Installing the Node Observability Operator using the CLI + +You can install the Node Observability Operator by using the OpenShift CLI (oc). + +.Prerequisites + +* You have installed the OpenShift CLI (oc). +* You have access to the cluster with `cluster-admin` privileges. + +.Procedure + +. 
Confirm that the Node Observability Operator is available by running the following command: ++ +[source,terminal] +---- +$ oc get packagemanifests -n openshift-marketplace node-observability-operator +---- + ++ +.Example output +[source,terminal] +---- +NAME CATALOG AGE +node-observability-operator Red Hat Operators 9h +---- + +. Create the `node-observability-operator` namespace by running the following command: ++ +[source,terminal] +---- +$ oc new-project node-observability-operator +---- + +. Create an `OperatorGroup` object YAML file: ++ +[source,yaml] +---- +cat < -o yaml | yq '.status.phase' +---- ++ +`` is the install plan name that you obtained from the output of the previous command. + ++ +.Example output +[source,terminal] +---- +COMPLETE +---- + +. Verify that the Node Observability Operator is up and running: ++ +[source,terminal] +---- +$ oc get deploy -n node-observability-operator +---- + ++ +.Example output +[source,terminal] +---- +NAME READY UP-TO-DATE AVAILABLE AGE +node-observability-operator-controller-manager 1/1 1 1 40h +---- diff --git a/modules/node-observability-install-web-console.adoc b/modules/node-observability-install-web-console.adoc new file mode 100644 index 000000000000..40fd247c25bb --- /dev/null +++ b/modules/node-observability-install-web-console.adoc @@ -0,0 +1,31 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/node-observability-operator.adoc + +:_content-type: PROCEDURE +[id="install-node-observability-using-web-console_{context}"] += Installing the Node Observability Operator using the web console + +You can install the Node Observability Operator from the {product-title} web console. + +.Prerequisites + +* You have access to the cluster with `cluster-admin` privileges. +* You have access to the {product-title} web console. + +.Procedure + +. Log in to the {product-title} web console. +. In the Administrator's navigation panel, expand *Operators* → *OperatorHub*. +. 
In the *All items* field, enter *Node Observability Operator* and select the *Node Observability Operator* tile. +. Click *Install*. +. On the *Install Operator* page, configure the following settings: +.. In the *Update channel* area, click *alpha*. +.. In the *Installation mode* area, click *A specific namespace on the cluster*. +.. From the *Installed Namespace* list, select *node-observability-operator*. +.. In the *Update approval* area, select *Automatic*. +.. Click *Install*. + +.Verification +. In the Administrator's navigation panel, expand *Operators* → *Installed Operators*. +. Verify that the Node Observability Operator is listed in the Operators list. diff --git a/modules/node-observability-installation.adoc b/modules/node-observability-installation.adoc new file mode 100644 index 000000000000..245faa8664a4 --- /dev/null +++ b/modules/node-observability-installation.adoc @@ -0,0 +1,8 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/node-observability-operator.adoc + +:_content-type: CONCEPT +[id="install-node-observability-operator_{context}"] += Installing the Node Observability Operator +The Node Observability Operator is not installed in {product-title} by default. You can install the Node Observability Operator by using the {product-title} CLI or the web console. diff --git a/modules/node-observability-run-profiling-query.adoc b/modules/node-observability-run-profiling-query.adoc new file mode 100644 index 000000000000..36f08afa93d6 --- /dev/null +++ b/modules/node-observability-run-profiling-query.adoc @@ -0,0 +1,83 @@ +// Module included in the following assemblies: +// +// * scalability_and_performance/node-observability-operator.adoc + +:_content-type: PROCEDURE +[id="running-profiling-query_{context}"] += Running the profiling query + +The profiling query is a blocking operation that fetches CRI-O and Kubelet profiling data for a duration of 30 seconds. 
The Node Observability Operator stores the profiling data inside the container file system `/run/node-observability` directory. To request a profiling data query, you must create a `NodeObservabilityRun` resource. + +[IMPORTANT] +==== +You can request only one profiling query at any point in time. +==== + +.Prerequisites +* You have installed the Node Observability Operator. +* You have created the `NodeObservability` custom resource (CR). +* You have access to the cluster with `cluster-admin` privileges. + +.Procedure + +. Create a `NodeObservabilityRun` resource file named `nodeobservabilityrun.yaml` that contains the following text: ++ +[source,yaml] +---- +apiVersion: nodeobservability.olm.openshift.io/v1alpha1 +kind: NodeObservabilityRun +metadata: + name: nodeobservabilityrun +spec: + nodeObservabilityRef: + name: cluster +---- + +. Run the `NodeObservabilityRun` to trigger the profiling: ++ +[source,terminal] +---- +$ oc apply -f nodeobservabilityrun.yaml +---- + +. Review the status of the `NodeObservabilityRun` by running the following command: ++ +[source,terminal] +---- +$ oc get nodeobservabilityrun nodeobservabilityrun -o yaml | yq '.status.conditions' +---- + ++ +.Example output +[source,terminal] +---- +conditions: +- lastTransitionTime: "2022-07-07T14:57:34Z" + message: Ready to start profiling + reason: Ready + status: "True" + type: Ready +- lastTransitionTime: "2022-07-07T14:58:10Z" + message: Profiling query done + reason: Finished + status: "True" + type: Finished +---- + ++ +The profiling query is complete when the status is `True` and the type is `Finished`. + +. 
Run the following bash script to retrieve the profiling data from the container's `/run/node-observability` path: ++ +[source,bash] +---- +for a in $(oc get nodeobservabilityrun nodeobservabilityrun -o yaml | yq .status.agents[].name); do + echo "agent ${a}" + mkdir -p "/tmp/${a}" + for p in $(oc exec "${a}" -c node-observability-agent -- bash -c "ls /run/node-observability/*.pprof"); do + f="$(basename ${p})" + echo "copying ${f} to /tmp/${a}/${f}" + oc exec "${a}" -c node-observability-agent -- cat "${p}" > "/tmp/${a}/${f}" + done +done +---- diff --git a/scalability_and_performance/node-observability-operator.adoc b/scalability_and_performance/node-observability-operator.adoc new file mode 100644 index 000000000000..e387933806db --- /dev/null +++ b/scalability_and_performance/node-observability-operator.adoc @@ -0,0 +1,25 @@ +:_content-type: ASSEMBLY +[id="using-node-observability-operator"] += Understanding the Node Observability Operator +include::_attributes/common-attributes.adoc[] +:context: node-observability-operator + +toc::[] + + +:FeatureName: The Node Observability Operator +include::snippets/technology-preview.adoc[leveloffset=+0] + +The Node Observability Operator collects and stores the CRI-O and Kubelet profiling data of worker nodes. You can use the profiling data to analyze the CRI-O and Kubelet performance trends and debug the performance-related issues. + +include::modules/node-observability-high-level-workflow.adoc[leveloffset=+1] + +include::modules/node-observability-installation.adoc[leveloffset=+1] + +include::modules/node-observability-install-cli.adoc[leveloffset=+2] + +include::modules/node-observability-install-web-console.adoc[leveloffset=+2] + +include::modules/node-observability-create-custom-resource.adoc[leveloffset=+1] + +include::modules/node-observability-run-profiling-query.adoc[leveloffset=+1]