From acb506ddb0af53c6cf161c8fb06443a67cfc0090 Mon Sep 17 00:00:00 2001
From: Glenn Matthews <glenn.matthews@networktocode.com>
Date: Fri, 22 Mar 2024 13:03:45 -0400
Subject: [PATCH] Add documentation about docker-compose/k8s health checks
 (#5449)

--------

Co-authored-by: Hanlin Miao <46973263+HanlinMiao@users.noreply.github.com>
Co-authored-by: Gary Snider <75227981+gsnider2195@users.noreply.github.com>
---
 changes/5340.documentation                    |   1 +
 development/dev.env                           |   1 +
 development/docker-compose.mysql.yml          |   8 +-
 development/docker-compose.postgres.yml       |   7 +-
 development/docker-compose.yml                |  33 +-
 mkdocs.yml                                    |   1 +
 .../administration/installation/docker.md     |   3 +
 .../installation/health-checks.md             | 298 ++++++++++++++++++
 8 files changed, 325 insertions(+), 27 deletions(-)
 create mode 100644 changes/5340.documentation
 create mode 100644 nautobot/docs/user-guide/administration/installation/health-checks.md

diff --git a/changes/5340.documentation b/changes/5340.documentation
new file mode 100644
index 0000000000..62e11a2d9c
--- /dev/null
+++ b/changes/5340.documentation
@@ -0,0 +1 @@
+Added installation documentation about recommended health-checks for Docker Compose and Kubernetes.
diff --git a/development/dev.env b/development/dev.env
index f6e0e6a32c..6287fbfd41 100644
--- a/development/dev.env
+++ b/development/dev.env
@@ -28,6 +28,7 @@ POSTGRES_USER=nautobot
 
 # Needed for Redis, must match the values for Nautobot above
 REDIS_PASSWORD=decinablesprewad
+REDISCLI_AUTH=$REDIS_PASSWORD
 
 # Needed for Selenium integration tests
 # WebDriver (Selenium client)
diff --git a/development/docker-compose.mysql.yml b/development/docker-compose.mysql.yml
index 32623a60af..37792c20b6 100644
--- a/development/docker-compose.mysql.yml
+++ b/development/docker-compose.mysql.yml
@@ -19,10 +19,10 @@ services:
       - ./mysql-unittests.sql:/docker-entrypoint-initdb.d/mysql-unittests.sql
       - mysqldata_nautobot:/var/lib/mysql
     healthcheck:
-      test: mysql -h localhost -u $$MYSQL_USER --password=$$MYSQL_PASSWORD --execute "SHOW DATABASES;"
-      start_period: 5s
-      interval: 5s
+      interval: 10s
       timeout: 5s
-      retries: 50
+      start_period: 30s
+      retries: 3
+      test: 'mysql -h localhost -u $$MYSQL_USER --password=$$MYSQL_PASSWORD --execute "SHOW DATABASES;"'
 volumes:
   mysqldata_nautobot:
diff --git a/development/docker-compose.postgres.yml b/development/docker-compose.postgres.yml
index 7b3ebffb29..8481873889 100644
--- a/development/docker-compose.postgres.yml
+++ b/development/docker-compose.postgres.yml
@@ -8,9 +8,10 @@ services:
     volumes:
       - pgdata_nautobot:/var/lib/postgresql/data
     healthcheck:
-      test: ["CMD-SHELL", "pg_isready -d $$POSTGRES_DB -U $$POSTGRES_USER"]
-      interval: 5s
+      interval: 10s
       timeout: 5s
-      retries: 50
+      start_period: 30s
+      retries: 3
+      test: "pg_isready -d $$POSTGRES_DB -U $$POSTGRES_USER"
 volumes:
   pgdata_nautobot:
diff --git a/development/docker-compose.yml b/development/docker-compose.yml
index 74e6072e93..d2a205d120 100644
--- a/development/docker-compose.yml
+++ b/development/docker-compose.yml
@@ -26,15 +26,11 @@ services:
       - dev.env
     tty: true
     healthcheck:
-      interval: 5s
-      timeout: 5s
+      interval: 10s
+      timeout: 10s
       start_period: 5m  # it takes a WHILE to run initial migrations with an empty DB
       retries: 3
-      test:
-        - "CMD"
-        - "curl"
-        - "-f"
-        - "http://localhost:8080/health/"
+      test: "nautobot-server health_check"
   celery_worker:
     image: "local/nautobot-dev:local-py${PYTHON_VER}"
     ports:
@@ -44,14 +40,10 @@ services:
     entrypoint: "watchmedo auto-restart --directory './' --pattern '*.py' --recursive -- nautobot-server celery worker -l INFO --events"
     healthcheck:
       interval: 60s
-      timeout: 30s
+      timeout: 10s
       start_period: 30s
       retries: 3
-      test:
-        - "CMD"
-        - "bash"
-        - "-c"
-        - "nautobot-server celery inspect ping --destination celery@$$HOSTNAME"
+      test: "nautobot-server celery inspect ping --destination celery@$$HOSTNAME"
     depends_on:
       nautobot:
         condition: service_healthy
@@ -68,12 +60,8 @@ services:
       timeout: 5s
       start_period: 30s
       retries: 3
-      test:
-        - "CMD"
-        - "/bin/sh"
-        - "-c"
-        # find the heartbeat file and report success if it was modified less than 0.1 minutes (6 seconds) ago, else fail
-        - '[ $$(find /tmp/nautobot_celery_beat_heartbeat -mmin -0.1 | wc -l) -eq 1 ] || false'
+      # find the heartbeat file and report success if it was modified less than 0.1 minutes (6 seconds) ago, else fail
+      test: "[ $$(find /tmp/nautobot_celery_beat_heartbeat -mmin -0.1 | wc -l) -eq 1 ] || false"
     depends_on:
       nautobot:
         condition: service_healthy
@@ -88,8 +76,13 @@ services:
       - sh
       - -c # this is to evaluate the $REDIS_PASSWORD from the env
       - redis-server --appendonly yes --requirepass $$REDIS_PASSWORD ## $$ because of docker-compose
+    healthcheck:
+      interval: 10s
+      timeout: 5s
+      retries: 3
+      test: "redis-cli -h localhost ping | grep PONG"
     env_file:
-      - dev.env
+      - ./dev.env
   selenium:
     image: selenium/standalone-firefox:4.9.1
     ports:
diff --git a/mkdocs.yml b/mkdocs.yml
index d3ee864ccd..e819323b18 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -280,6 +280,7 @@ nav:
         - External Authentication (Optional): "user-guide/administration/installation/external-authentication.md"
         - Installing and Using Plugins: "user-guide/administration/installation/app-install.md"
         - Nautobot Docker Images: "user-guide/administration/installation/docker.md"
+        - Health Checks: "user-guide/administration/installation/health-checks.md"
         - SELinux Troubleshooting: "user-guide/administration/installation/selinux-troubleshooting.md"
       - Upgrading:
         - Database Backup: "user-guide/administration/upgrading/database-backup.md"
diff --git a/nautobot/docs/user-guide/administration/installation/docker.md b/nautobot/docs/user-guide/administration/installation/docker.md
index 274cb2e5a7..e72374fe1d 100644
--- a/nautobot/docs/user-guide/administration/installation/docker.md
+++ b/nautobot/docs/user-guide/administration/installation/docker.md
@@ -76,6 +76,9 @@ In addition to all tags described in the previous section, the following additio
 
 Nautobot requires a MySQL or PostgreSQL database and Redis instance before it will start. Because of this the quickest and easiest way to get Nautobot running is with [Docker Compose](https://docs.docker.com/compose/), which will install and configure PostgreSQL and Redis containers for you automatically.
 
+!!! tip
+    Whether you're using the Docker CLI, Docker Compose, or [Kubernetes](https://kubernetes.io/), you'll want to set up appropriate [health checks](health-checks.md) for your containers.
+
 ## Configuration
 
 Most configuration parameters are available via environment variables which can be passed to the container. If you desire you can inject your own `nautobot_config.py` by overriding `/opt/nautobot/nautobot_config.py` using [docker volumes](https://docs.docker.com/storage/volumes/) by adding `-v /local/path/to/custom/nautobot_config.py:/opt/nautobot/nautobot_config.py` to your docker run command, for example:
diff --git a/nautobot/docs/user-guide/administration/installation/health-checks.md b/nautobot/docs/user-guide/administration/installation/health-checks.md
new file mode 100644
index 0000000000..6d575b3a0a
--- /dev/null
+++ b/nautobot/docs/user-guide/administration/installation/health-checks.md
@@ -0,0 +1,298 @@
+# Nautobot Health Checks
+
+In a production deployment of Nautobot, you'll want health checks (also termed liveness checks, readiness checks, etc.) for each distinct component of the Nautobot system, to be able to detect if any component fails and ideally respond automatically. While this topic can be (and is) the subject of multiple books, this document attempts to provide some basic "best practices" guidelines. If you're deploying Nautobot as part of a larger enterprise system, of course you'll want to follow your organization's experts and their guidance, but if you're "on your own", you can do worse than starting here.
+
+## Health Check Approaches
+
+In general the following commands or HTTP requests can serve as health checks for the various components of a Nautobot system.
+
+### Nautobot HTTP Server
+
+In addition to simply monitoring the existence of the `nautobot-server` process ID, two more in-depth approaches are possible here.
+
+An HTTP `GET` request to `<server>/health/` should return an HTTP `200 OK` response so long as:
+
+- the server is running
+- and the server can connect to the database
+- and all Django migrations have been applied (check added in Nautobot 2.2)
+- and the server can connect to Redis
+- and the server can write to an appropriate location on the filesystem
+- and the server is not too busy handling other requests to respond to this request.
+
+Similarly, but not identically, the CLI command `nautobot-server health_check` should run and return an exit code of `0` (success) so long as:
+
+- the command can connect to the database
+- and all Django migrations have been applied (check added in Nautobot 2.2)
+- and the command can connect to Redis
+- and the command can write to an appropriate location on the filesystem
+
+Note the differences between these two. In some situations you'll want to use both for different types of checks. More on this later in this document.
+
+### Nautobot Celery Worker
+
+In addition to monitoring the existence of a given Celery worker process ID, you can use the fact that Celery provides a [`celery inspect ping` CLI command](https://docs.celeryq.dev/en/stable/reference/cli.html#celery-inspect) that sends a short message to one or more Celery workers and reports back on whether each one responds. Nautobot wraps this with the `nautobot-server` CLI command, so in general you can run `nautobot-server celery inspect ping --destination <worker name>` to confirm whether a given worker is able to receive and respond to Celery control messages.
+
+!!! tip
+    A Celery worker's name defaults to `celery@$HOSTNAME`, but you can override it by starting the worker with the `-n <name>` argument if needed.
+
+### Nautobot Celery Beat
+
+In addition to monitoring the Celery Beat process ID, you can use the fact that Nautobot's custom Celery Beat scheduler respects the [`CELERY_BEAT_HEARTBEAT_FILE`](../configuration/optional-settings.md#celery_beat_heartbeat_file) configuration setting, which specifies a filesystem path that will be repeatedly [`touch`ed](https://en.wikipedia.org/wiki/Touch_(command)) to update its last-modified timestamp so long as the scheduler is running. You can check this timestamp against the current system time to detect whether the Celery Beat scheduler is firing as expected. One way is using the `find` command with its `-mmin` parameter, and checking whether it finds the expected file with a recent enough modification time (here, 0.1 minutes, or 6 seconds) or not:
+
+```shell
+[ $(find $NAUTOBOT_CELERY_BEAT_HEARTBEAT_FILE -mmin -0.1 | wc -l) -eq 1 ] || false
+```
+
+### Databases
+
+#### PostgreSQL
+
+PostgreSQL provides the [`pg_isready` CLI command](https://www.postgresql.org/docs/current/app-pg-isready.html) to check whether the database server is running and accepting connections.
+
+#### MySQL
+
+While MySQL provides the [`mysqladmin ping` CLI command](https://dev.mysql.com/doc/refman/8.0/en/mysqladmin.html), it's important to note that this command only checks whether the database server is running - it still exits with return code `0` if the server is running but not accepting connections. Therefore you might in some cases wish to run a command that actually connects to the database, such as `mysql --execute "SHOW DATABASES;"`.
+
+### Redis
+
+Redis provides the [`redis-cli ping` CLI command](https://redis.io/commands/ping/) for detecting whether the Redis server is alive. It will output `PONG` on success and exit with return code `0`. Note though that it may also exit with code `0` in cases where the server has started but is not yet ready to receive or serve data.
+
+!!! tip
+    If you have the Redis server configured to require a password, you will need to set the `REDISCLI_AUTH` environment variable to this password before `redis-cli ping` will be successful.
+
+## Deployments with systemd
+
+For systemd deployments, the underlying services of PostgreSQL/MySQL and Redis integrate natively with systemd's `sd_notify` API to provide additional status information to the system, and `uwsgi` does as well. We recommend following the standard deployment patterns provided by your OS for PostgreSQL/MySQL and Redis. For the Nautobot service and Celery/Beat services, follow the Nautobot installation documentation at [Setup systemd](services.md#setup-systemd).
+
+## Kubernetes Deployments
+
+Kubernetes (k8s) distinguishes between "startup", "readiness", and "liveness" probes. In brief:
+
+- Startup probes detect whether a container has finished starting up.
+- Readiness probes detect whether a container is ready to accept traffic.
+- Liveness probes detect whether a container needs to be restarted.
+
+For more details, refer to the [Kubernetes documentation](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/). A working example can be found in the [Nautobot Helm charts](https://github.com/nautobot/helm-charts) repository.
+
+### Nautobot Server Container in k8s
+
+For the Nautobot web server, you'll probably want to use `nautobot-server health_check` as a liveness probe (since it won't fail if the Nautobot server is too busy handling many HTTP requests, unlike the `/health/` endpoint) and use an HTTP request to `/health/` as a startup probe and a readiness probe (since it won't report success unless the Nautobot server is running and responding to HTTP requests). For example:
+
+```yaml
+startupProbe:  # detects whether the container has finished starting up
+  httpGet:
+    path: "/health/"
+    port: "http"  # presumably a named container port; adjust to match your pod spec
+  periodSeconds: 10
+  failureThreshold: 30  # 30 failures x 10s period = roughly 5 minutes allowed for startup
+
+readinessProbe:  # detects whether the container is ready to accept traffic
+  httpGet:
+    path: "/health/"
+    port: "http"
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+
+livenessProbe:  # detects whether the container needs to be restarted
+  exec:
+    command:
+      - "nautobot-server"
+      - "health_check"
+  periodSeconds: 10
+  timeoutSeconds: 10  # the CLI command takes a few seconds to start up
+  failureThreshold: 3
+```
+
+### Celery Worker Container in k8s
+
+The Celery worker container can use `nautobot-server celery inspect ping` for both liveness and readiness probes:
+
+```yaml
+readinessProbe:
+  exec:
+    command:
+      - "/bin/bash"
+      - "-c"  # run through a shell so that $HOSTNAME gets expanded
+      - "nautobot-server celery inspect ping --destination celery@$HOSTNAME"  # celery@$HOSTNAME is the default worker name
+  periodSeconds: 60
+  timeoutSeconds: 10
+  failureThreshold: 3
+
+livenessProbe:  # same ping check as the readiness probe
+  exec:
+    command:
+      - "/bin/bash"
+      - "-c"
+      - "nautobot-server celery inspect ping --destination celery@$HOSTNAME"
+  periodSeconds: 60
+  timeoutSeconds: 10
+  failureThreshold: 3
+```
+
+### Celery Beat Container in k8s
+
+The Celery Beat container doesn't need a readiness probe, but can benefit from a liveness probe:
+
+```yaml
+livenessProbe:
+  exec:
+    command:  # assumes NAUTOBOT_CELERY_BEAT_HEARTBEAT_FILE is set in the container's environment - confirm
+      - "/bin/bash"
+      - "-c"
+      - "[ $(find $NAUTOBOT_CELERY_BEAT_HEARTBEAT_FILE -mmin -0.1 | wc -l) -eq 1 ] || false"  # succeeds only if the heartbeat file was modified less than 0.1 minutes (6 seconds) ago
+  initialDelaySeconds: 30
+  periodSeconds: 5
+  timeoutSeconds: 5
+  failureThreshold: 3
+```
+
+### Redis Container in k8s
+
+```yaml
+readinessProbe:
+  exec:
+    command:  # if Redis requires a password, REDISCLI_AUTH must be set in the container's environment
+      - "/bin/bash"
+      - "-c"
+      - "redis-cli -h localhost ping | grep PONG"  # ready only if the reply actually contains PONG
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+
+livenessProbe:
+  exec:
+    command:
+      - "/bin/bash"
+      - "-c"
+      - "redis-cli -h localhost ping"  # exit code only; may be 0 before Redis is ready to serve data
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+```
+
+### PostgreSQL Container in k8s
+
+```yaml
+readinessProbe:
+  exec:
+    command:  # assumes POSTGRES_DB and POSTGRES_USER are set in the container's environment - confirm
+      - "/bin/bash"
+      - "-c"  # run through a shell so that the $POSTGRES_* variables get expanded
+      - "pg_isready -d $POSTGRES_DB -U $POSTGRES_USER"  # is the server running and accepting connections?
+  initialDelaySeconds: 30
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+
+livenessProbe:  # same pg_isready check as the readiness probe
+  exec:
+    command:
+      - "/bin/bash"
+      - "-c"
+      - "pg_isready -d $POSTGRES_DB -U $POSTGRES_USER"
+  initialDelaySeconds: 30
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+```
+
+### MySQL Container in k8s
+
+```yaml
+readinessProbe:
+  exec:
+    command:  # assumes MYSQL_USER and MYSQL_PASSWORD are set in the container's environment - confirm
+      - "/bin/bash"
+      - "-c"
+      - 'mysql -u $MYSQL_USER --password=$MYSQL_PASSWORD -h localhost --execute "SHOW DATABASES;"'  # actually connects to the database and runs a query
+  initialDelaySeconds: 30
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+
+livenessProbe:
+  exec:
+    command:
+      - "/bin/bash"
+      - "-c"
+      - "mysqladmin -u $MYSQL_USER --password=$MYSQL_PASSWORD -h localhost ping"  # only checks that the server is running, not that it accepts connections
+  initialDelaySeconds: 30
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 3
+```
+
+## Docker Compose Deployments
+
+Docker Compose supports a single `healthcheck` for each container.
+
+### Nautobot Server Container in Docker Compose
+
+We recommend the CLI-based health-check rather than the HTTP health-check here because the former will not fail when all request-handler workers are busy.
+
+```yaml
+healthcheck:
+  interval: 10s
+  timeout: 10s
+  start_period: 5m  # in Nautobot 2.2 and later, this won't report success until all migrations have run
+  retries: 3
+  test: "nautobot-server health_check"  # CLI check of database, migrations, Redis, and filesystem write access
+```
+
+### Celery Worker Container in Docker Compose
+
+```yaml
+healthcheck:
+  interval: 60s
+  timeout: 10s
+  start_period: 30s
+  retries: 3
+  test: "nautobot-server celery inspect ping --destination celery@$$HOSTNAME"  # "$$" because of Docker Compose: it yields a literal "$" for the container to expand
+```
+
+### Celery Beat Container in Docker Compose
+
+```yaml
+healthcheck:
+  interval: 5s
+  timeout: 5s
+  start_period: 30s
+  retries: 3
+  test: "[ $$(find /tmp/nautobot_celery_beat_heartbeat -mmin -0.1 | wc -l) -eq 1 ] || false"  # healthy only if the heartbeat file was modified less than 0.1 minutes (6 seconds) ago
+```
+
+### Redis Container in Docker Compose
+
+```yaml
+healthcheck:
+  interval: 10s
+  timeout: 5s
+  retries: 3
+  test: "redis-cli -h localhost ping | grep PONG"  # healthy only if the reply actually contains PONG
+```
+
+!!! tip
+    If you have the Redis server configured to require a password, you will need to set the `REDISCLI_AUTH` environment variable to this password before `redis-cli ping` will be successful.
+
+### PostgreSQL Container in Docker Compose
+
+```yaml
+healthcheck:
+  interval: 10s
+  timeout: 5s
+  start_period: 30s
+  retries: 3
+  test: "pg_isready -d $$POSTGRES_DB -U $$POSTGRES_USER"  # "$$" because of Docker Compose: the variables are expanded inside the container
+```
+
+### MySQL Container in Docker Compose
+
+```yaml
+healthcheck:
+  interval: 10s
+  timeout: 5s
+  start_period: 30s
+  retries: 3
+  test: 'mysql -h localhost -u $$MYSQL_USER --password=$$MYSQL_PASSWORD --execute "SHOW DATABASES;"'  # actually connects and runs a query, unlike `mysqladmin ping`
+```