From 203fc63025fba6b4c631367c2e05b7dade1c28c1 Mon Sep 17 00:00:00 2001 From: Neal Alhadeff Date: Wed, 11 May 2022 15:08:16 -0400 Subject: [PATCH] bz2076315: making bug revisions --- modules/graceful-shutdown.adoc | 104 ++++++++------------------------- 1 file changed, 25 insertions(+), 79 deletions(-) diff --git a/modules/graceful-shutdown.adoc b/modules/graceful-shutdown.adoc index 8f7199a21804..1b52a0f10ef4 100644 --- a/modules/graceful-shutdown.adoc +++ b/modules/graceful-shutdown.adoc @@ -33,110 +33,56 @@ If your cluster fails to recover, follow the steps to restore to a previous clus .Procedure -. If you plan to shut down the cluster for an extended period of time, determine the date that cluster certificates expire. -+ -You must restart the cluster prior to the date that certificates expire. As the cluster restarts, the process might require you to manually approve the pending certificate signing requests (CSRs) to recover kubelet certificates. - -.. Check the expiration date for the `kube-apiserver-to-kubelet-signer` CA certificate: +. If you are shutting the cluster down for an extended period, determine the date on which certificates expire. + [source,terminal] ---- -$ oc -n openshift-kube-apiserver-operator get secret kube-apiserver-to-kubelet-signer -o jsonpath='{.metadata.annotations.auth\.openshift\.io/certificate-not-after}{"\n"}' +$ oc -n openshift-kube-apiserver-operator get secret kube-apiserver-to-kubelet-signer -o jsonpath='{.metadata.annotations.auth\.openshift\.io/certificate-not-after}' ---- + .Example output -[source,terminal] ----- -2023-08-05T14:37:50Z ----- - -.. Check the expiration date for the kubelet certificates: - -... Start a debug session for a control plane node by running the following command: -+ -[source,terminal] ---- -$ oc debug node/ +2022-08-05T14:37:50Zuser@user:~ $ <1> ---- +<1> To ensure that the cluster can restart gracefully, plan to restart it on or before the specified date. 
As the cluster restarts, the process might require you to manually approve the pending certificate signing requests (CSRs) to recover kubelet certificates. -
+. Mark the control plane nodes in the cluster as unschedulable. You can do this from your cloud provider's web console, or run the following loop: +
-[source,terminal]
+[source,terminal]
----
-sh-4.4# chroot /host
----
-
-... Check the kubelet client certificate expiration date by running the following command:
-+
-[source,terminal]
----
-sh-5.1# openssl x509 -in /var/lib/kubelet/pki/kubelet-client-current.pem -noout -enddate
+$ for node in $(oc get nodes -o jsonpath='{.items[*].metadata.name}'); do echo ${node} ; oc adm cordon ${node} ; done
----
+
.Example output
[source,terminal]
----
-notAfter=Jun 6 10:50:07 2023 GMT
+ci-ln-mgdnf4b-72292-n547t-master-0
+node/ci-ln-mgdnf4b-72292-n547t-master-0 cordoned
+ci-ln-mgdnf4b-72292-n547t-master-1
+node/ci-ln-mgdnf4b-72292-n547t-master-1 cordoned
+ci-ln-mgdnf4b-72292-n547t-master-2
+node/ci-ln-mgdnf4b-72292-n547t-master-2 cordoned
+ci-ln-mgdnf4b-72292-n547t-worker-a-s7ntl
+node/ci-ln-mgdnf4b-72292-n547t-worker-a-s7ntl cordoned
+ci-ln-mgdnf4b-72292-n547t-worker-b-cmc9k
+node/ci-ln-mgdnf4b-72292-n547t-worker-b-cmc9k cordoned
+ci-ln-mgdnf4b-72292-n547t-worker-c-vcmtn
+node/ci-ln-mgdnf4b-72292-n547t-worker-c-vcmtn cordoned
----
... Check the kubelet server certificate expiration date by running the following command: +
-[source,terminal]
----
-sh-5.1# openssl x509 -in /var/lib/kubelet/pki/kubelet-server-current.pem -noout -enddate
----
+. Evacuate the pods using the following method: +
-.Example output
-[source,terminal]
+[source,terminal]
----
-notAfter=Jun 6 10:50:07 2023 GMT
+$ for node in $(oc get nodes -l node-role.kubernetes.io/worker -o jsonpath='{.items[*].metadata.name}'); do echo ${node} ; oc adm drain ${node} --delete-emptydir-data --ignore-daemonsets=true --timeout=15s ; done
----
-
-...
Exit the debug session. - -... Repeat these steps to check certificate expiration dates on all control plane nodes. To ensure that the cluster can restart gracefully, plan to restart it before the earliest certificate expiration date. - -. Shut down all of the nodes in the cluster. You can do this from your cloud provider's web console, or run the following loop: + -[source,terminal] ----- -$ for node in $(oc get nodes -o jsonpath='{.items[*].metadata.name}'); do oc debug node/${node} -- chroot /host shutdown -h 1; done <1> ----- -<1> `-h 1` indicates how long, in minutes, this process lasts before the control-plane nodes are shut down. For large-scale clusters with 10 nodes or more, set to 10 minutes or longer to make sure all the compute nodes have time to shut down first. +. Shut down all of the nodes in the cluster. You can do this from your cloud provider’s web console, or run the following loop: + -.Example output ----- -Starting pod/ip-10-0-130-169us-east-2computeinternal-debug ... -To use host binaries, run `chroot /host` -Shutdown scheduled for Mon 2021-09-13 09:36:17 UTC, use 'shutdown -c' to cancel. - -Removing debug pod ... -Starting pod/ip-10-0-150-116us-east-2computeinternal-debug ... -To use host binaries, run `chroot /host` -Shutdown scheduled for Mon 2021-09-13 09:36:29 UTC, use 'shutdown -c' to cancel. ----- -+ -Shutting down the nodes using one of these methods allows pods to terminate gracefully, which reduces the chance for data corruption. 
-+
-[NOTE]
-====
-Adjust the shut down time to be longer for large-scale clusters:
-[source,terminal]
+[source,terminal]
----
-$ for node in $(oc get nodes -o jsonpath='{.items[*].metadata.name}'); do oc debug node/${node} -- chroot /host shutdown -h 10; done
+$ for node in $(oc get nodes -o jsonpath='{.items[*].metadata.name}'); do oc debug node/${node} -- chroot /host shutdown -h 1 ; done
----
-====
-+
-[NOTE]
-====
-It is not necessary to drain control plane nodes of the standard pods that ship with {product-title} prior to shutdown.
-Cluster administrators are responsible for ensuring a clean restart of their own workloads after the cluster is restarted. If you drained control plane nodes prior to shutdown because of custom workloads, you must mark the control plane nodes as schedulable before the cluster will be functional again after restart.
-====
-
-. Shut off any cluster dependencies that are no longer needed, such as external storage or an LDAP server. Be sure to consult your vendor's documentation before doing so.
-+
-[IMPORTANT]
-====
-If you deployed your cluster on a cloud-provider platform, do not shut down, suspend, or delete the associated cloud resources. If you delete the cloud resources of a suspended virtual machine, {product-title} might not restore successfully.
-====