diff --git a/.env.example b/.env.example index f798bd7f..5fd724fc 100644 --- a/.env.example +++ b/.env.example @@ -34,5 +34,13 @@ ENV_DOCKER_USER_GROUP="ggroup" ENV_PING_USERNAME= ENV_PING_PASSWORD= +# --- HTTP Server +ENV_HTTP_PORT=8080 + # --- SEO: SPA application directory ENV_SPA_DIR= +ENV_SPA_IMAGES_DIR= + +# --- Monitoring: Grafana admin password +# REQUIRED for Grafana dashboard access +GRAFANA_ADMIN_PASSWORD= diff --git a/.gitignore b/.gitignore index 17ac5765..c1d9c088 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,8 @@ storage/seo/*.* !storage/seo/.gitkeep # --- [Caddy]: mtls -caddy/mtls/*.* -!caddy/mtls/.gitkeep +infra/caddy/mtls/*.* +!infra/caddy/mtls/.gitkeep # --- [API]: Bin bin/* diff --git a/Makefile b/Makefile index 70b54ce6..694ddc59 100644 --- a/Makefile +++ b/Makefile @@ -34,14 +34,15 @@ CGO_ENABLED := 1 # -------------------------------------------------------------------------------------------------------------------- # # -------------------------------------------------------------------------------------------------------------------- # -include ./metal/makefile/helpers.mk -include ./metal/makefile/env.mk -include ./metal/makefile/db.mk -include ./metal/makefile/app.mk -include ./metal/makefile/logs.mk -include ./metal/makefile/build.mk -include ./metal/makefile/infra.mk -include ./metal/makefile/caddy.mk +include ./infra/makefile/helpers.mk +include ./infra/makefile/env.mk +include ./infra/makefile/db.mk +include ./infra/makefile/app.mk +include ./infra/makefile/logs.mk +include ./infra/makefile/build.mk +include ./infra/makefile/infra.mk +include ./infra/makefile/caddy.mk +include ./infra/makefile/monitor.mk # -------------------------------------------------------------------------------------------------------------------- # # -------------------------------------------------------------------------------------------------------------------- # @@ -104,6 +105,14 @@ help: @printf "$(BOLD)$(BLUE)Caddy Commands:$(NC)\n" @printf " 
$(BOLD)$(GREEN)caddy-gen-cert$(NC) : Generate the caddy's mtls certificates.\n" @printf " $(BOLD)$(GREEN)caddy-del-cert$(NC) : Remove the caddy's mtls certificates.\n" - @printf " $(BOLD)$(GREEN)caddy-validate$(NC) : Validates caddy's files syntax.\n" + @printf " $(BOLD)$(GREEN)caddy-validate$(NC) : Validates caddy's files syntax.\n\n" + + @printf "$(BOLD)$(BLUE)Monitoring Commands:$(NC)\n" + @printf " $(BOLD)$(GREEN)monitor-up$(NC) : Start the monitoring stack (Prometheus, Grafana).\n" + @printf " $(BOLD)$(GREEN)monitor-down$(NC) : Stop the monitoring stack.\n" + @printf " $(BOLD)$(GREEN)monitor-status$(NC) : Show status of monitoring services.\n" + @printf " $(BOLD)$(GREEN)monitor-test$(NC) : Run monitoring stack test suite.\n" + @printf " $(BOLD)$(GREEN)monitor-grafana$(NC) : Open Grafana dashboards in browser.\n" + @printf " $(BOLD)$(GREEN)monitor-help$(NC) : Show detailed monitoring commands.\n" @printf "$(NC)\n" diff --git a/docker-compose.yml b/docker-compose.yml index 513b806a..a23ffc80 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,14 @@ volumes: caddy_config: go_mod_cache: driver: local + prometheus_data_prod: + driver: local + prometheus_data_local: + driver: local + grafana_data_prod: + driver: local + grafana_data_local: + driver: local # --- DB: Define a named volume at the top level. # Docker will manage its lifecycle. 
@@ -30,7 +38,7 @@ services: caddy_prod: image: api-caddy_prod build: - context: ./caddy + context: ./infra/caddy dockerfile: Dockerfile args: - CADDY_VERSION=2.10.2 @@ -40,16 +48,27 @@ services: restart: unless-stopped depends_on: - api + + # --- The 443:443/udp is required for HTTP/3 + # NOTES: + # - Admin API (2019) is bound to 127.0.0.1 inside the container and is NOT published to host + # - Prometheus scrapes metrics from dedicated endpoint (9180) via Docker internal DNS ports: - "80:80" - "443:443" - "443:443/udp" # Required for HTTP/3 + # NOTE: Admin API (2019) is NOT published to host (container loopback only) + # Prometheus scrapes Caddy metrics from :9180 via Docker internal DNS + + # --- Dedicated /metrics endpoint for Prometheus (internal network only) + expose: + - "9180" volumes: - caddy_data:/data - caddy_config:/config - - ./caddy/Caddyfile.prod:/etc/caddy/Caddyfile + - ./infra/caddy/Caddyfile.prod:/etc/caddy/Caddyfile - ${CADDY_LOGS_PATH}:/var/log/caddy - - ./caddy/mtls:/etc/caddy/mtls:ro + - ./infra/caddy/mtls:/etc/caddy/mtls:ro networks: caddy_net: aliases: @@ -57,7 +76,7 @@ services: caddy_local: build: - context: ./caddy + context: ./infra/caddy dockerfile: Dockerfile args: - CADDY_VERSION=2.10.2 @@ -68,15 +87,247 @@ services: depends_on: - api ports: - - "8080:80" + - "18080:80" - "8443:443" + - "127.0.0.1:2019:2019" # Admin API - localhost only for debugging + + # --- Dedicated /metrics endpoint for Prometheus (internal network only) + expose: + - "9180" + volumes: - caddy_data:/data - caddy_config:/config - - ./caddy/mtls:/etc/caddy/mtls:ro - - ./caddy/Caddyfile.local:/etc/caddy/Caddyfile + - ./infra/caddy/mtls:/etc/caddy/mtls:ro + - ./infra/caddy/Caddyfile.local:/etc/caddy/Caddyfile + networks: + - caddy_net + + prometheus: + image: prom/prometheus:v3.0.1 + profiles: ["prod"] + container_name: oullin_prometheus + restart: unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - 
'--storage.tsdb.retention.time=30d' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + ports: + - "127.0.0.1:9090:9090" + volumes: + - ./infra/metrics/prometheus/provisioning/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data_prod:/prometheus + networks: + - caddy_net + - oullin_net + depends_on: + caddy_prod: + condition: service_started + postgres_exporter: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.25' + memory: 256M + + prometheus_local: + image: prom/prometheus:v3.0.1 + profiles: ["local"] + container_name: oullin_prometheus_local + restart: unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=7d' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + ports: + - "9090:9090" + volumes: + - ./infra/metrics/prometheus/provisioning/prometheus.local.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data_local:/prometheus + networks: + - caddy_net + - oullin_net + depends_on: + caddy_local: + condition: service_started + postgres_exporter_local: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.25' + memory: 256M + + postgres_exporter: + image: prometheuscommunity/postgres-exporter:v0.15.0 + profiles: ["prod"] + container_name: oullin_postgres_exporter + restart: unless-stopped + entrypoint: 
["/postgres-exporter-entrypoint.sh"] + volumes: + - ./infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh:/postgres-exporter-entrypoint.sh:ro + secrets: + - pg_username + - pg_password + - pg_dbname + networks: + - oullin_net + - caddy_net + depends_on: + api-db: + condition: service_healthy + expose: + - "9187" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9187/"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: '0.25' + memory: 128M + reservations: + cpus: '0.05' + memory: 32M + + postgres_exporter_local: + image: prometheuscommunity/postgres-exporter:v0.15.0 + profiles: ["local"] + container_name: oullin_postgres_exporter_local + restart: unless-stopped + entrypoint: ["/postgres-exporter-entrypoint.sh"] + volumes: + - ./infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh:/postgres-exporter-entrypoint.sh:ro + secrets: + - pg_username + - pg_password + - pg_dbname + networks: + - oullin_net + - caddy_net + depends_on: + api-db: + condition: service_healthy + expose: + - "9187" + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9187/"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + deploy: + resources: + limits: + cpus: '0.25' + memory: 128M + reservations: + cpus: '0.05' + memory: 32M + + grafana: + image: grafana/grafana:11.4.0 + profiles: ["prod"] + container_name: oullin_grafana + restart: unless-stopped + ports: + - "127.0.0.1:3000:3000" + environment: + - GF_SERVER_ROOT_URL=http://localhost:3000 + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set in .env file} + - GF_USERS_ALLOW_SIGN_UP=false + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_INSTALL_PLUGINS= + - GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus:9090 + volumes: + - grafana_data_prod:/var/lib/grafana + - 
./infra/metrics/grafana/provisioning:/etc/grafana/provisioning:ro + - ./infra/metrics/grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - caddy_net + depends_on: + prometheus: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.1' + memory: 128M + + grafana_local: + image: grafana/grafana:11.4.0 + profiles: ["local"] + container_name: oullin_grafana_local + restart: unless-stopped + ports: + - "3000:3000" + environment: + - GF_SERVER_ROOT_URL=http://localhost:3000 + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set in .env file} + - GF_USERS_ALLOW_SIGN_UP=false + - GF_AUTH_ANONYMOUS_ENABLED=false + - GF_INSTALL_PLUGINS= + - GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus_local:9090 + volumes: + - grafana_data_local:/var/lib/grafana + - ./infra/metrics/grafana/provisioning:/etc/grafana/provisioning:ro + - ./infra/metrics/grafana/dashboards:/var/lib/grafana/dashboards:ro networks: - caddy_net + depends_on: + prometheus_local: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.1' + memory: 128M # A dedicated service for running one-off Go commands api-runner: @@ -86,7 +337,7 @@ services: - ./.env build: context: . - dockerfile: ./docker/dockerfile-api + dockerfile: ./infra/docker/dockerfile-api target: builder volumes: - .:/app @@ -128,7 +379,7 @@ services: ENV_HTTP_HOST: 0.0.0.0 build: context: . 
- dockerfile: ./docker/dockerfile-api + dockerfile: ./infra/docker/dockerfile-api args: - APP_VERSION=0.0.0.1 - APP_HOST_PORT=${ENV_HTTP_PORT} diff --git a/go.mod b/go.mod index 642b8e8b..837d0b83 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( github.com/joho/godotenv v1.5.1 github.com/klauspost/compress v1.18.0 github.com/lib/pq v1.10.9 + github.com/prometheus/client_golang v1.20.5 github.com/rs/cors v1.11.1 github.com/testcontainers/testcontainers-go v0.39.0 github.com/testcontainers/testcontainers-go/modules/postgres v0.39.0 @@ -29,7 +30,9 @@ require ( dario.cat/mergo v1.0.2 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/log v0.1.0 // indirect @@ -66,11 +69,15 @@ require ( github.com/moby/sys/userns v0.1.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/morikuni/aec v1.0.0 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/shirou/gopsutil/v4 v4.25.9 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/stretchr/testify v1.11.1 // indirect diff --git a/go.sum b/go.sum index 81962bc9..f2d5a7aa 100644 --- a/go.sum +++ b/go.sum @@ -10,8 +10,12 @@ github.com/Microsoft/go-winio v0.6.2 
h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ= github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/webp v1.4.0 h1:6DA2pkkRUPnbOHvvsmGI3He1hBKf/bkRlniAiSGuEko= github.com/chai2010/webp v1.4.0/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= @@ -92,6 +96,8 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= @@ -120,6 +126,8 @@ github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= github.com/moby/term v0.5.2/go.mod 
h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= @@ -132,6 +140,14 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= 
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/rs/cors v1.11.1 h1:eU3gRzXLRK57F5rKMGMZURNdIG4EoAmX8k94r9wXWHA= @@ -185,8 +201,6 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= -golang.org/toolchain v0.0.1-go1.25.3.linux-amd64 h1:OsvRiFtt0A9JsTaoQsnFK4wKOOAY2UtJvkOT+Djl7tQ= -golang.org/toolchain v0.0.1-go1.25.3.linux-amd64/go.mod h1:c/4eKWFBYMD/i1j7ipNwtrHQP02jj74611NzmDqwkJE= golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ= golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc= golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= diff --git a/handler/metrics.go b/handler/metrics.go new file mode 100644 index 00000000..9cbcbcdd --- /dev/null +++ b/handler/metrics.go @@ -0,0 +1,23 @@ +package handler + +import ( + "net/http" + + "github.com/oullin/pkg/endpoint" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +type MetricsHandler struct{} + +func NewMetricsHandler() MetricsHandler { + return MetricsHandler{} +} + +// Handle serves the Prometheus metrics exposition for the request and always returns nil. +// Protected by Docker network isolation - only accessible from containers +// within caddy_net and oullin_net networks (not exposed to host) +func (h MetricsHandler) Handle(w http.ResponseWriter, r *http.Request) *endpoint.ApiError { + // Serve Prometheus metrics using the standard promhttp handler + promhttp.Handler().ServeHTTP(w, r) + return nil +} diff --git a/caddy/Caddyfile.local b/infra/caddy/Caddyfile.local similarity index 74% rename from caddy/Caddyfile.local rename to infra/caddy/Caddyfile.local index d1c84dbc..57e3819c 100644 --- a/caddy/Caddyfile.local +++ b/infra/caddy/Caddyfile.local 
@@ -2,6 +2,15 @@ # This is the most reliable way to ensure Caddy acts as a simple HTTP proxy locally. { auto_https off + + # Enable metrics collection for HTTP handlers + servers { + metrics + } + + # Admin API listens only on localhost within container for security + # Prometheus accesses /metrics via the dedicated :9180 listener, not the admin API + admin 127.0.0.1:2019 } # It tells Caddy to listen on its internal port 80 for any incoming hostname. @@ -35,8 +44,27 @@ respond 204 } + # Block protected paths + @protected path /metrics /generate-signature* + handle @protected { + respond 403 + } + # Reverse proxy all incoming requests to the 'api' service. # - The service name 'api' is resolved by Docker's internal DNS to the correct container IP on the 'caddy_net' network. # - The API container listens on port 8080 (from the ENV_HTTP_PORT). reverse_proxy api:8080 } + +# INTERNAL metrics endpoint for Prometheus scraping +# This exposes ONLY /metrics, not the full admin API +# Listens on all interfaces but not published to host (Docker network only) +:9180 { + handle /metrics { + reverse_proxy localhost:2019 + } + + handle { + respond 404 + } +} diff --git a/caddy/Caddyfile.prod b/infra/caddy/Caddyfile.prod similarity index 82% rename from caddy/Caddyfile.prod rename to infra/caddy/Caddyfile.prod index 16f46287..b71aca96 100644 --- a/caddy/Caddyfile.prod +++ b/infra/caddy/Caddyfile.prod @@ -1,3 +1,15 @@ +# Global options: Enable the admin API and metrics +{ + # Enable metrics collection for HTTP handlers + servers { + metrics + } + + # Admin API listens only on localhost within container for security + # Prometheus accesses /metrics via the dedicated :9180 listener, not the admin API + admin 127.0.0.1:2019 +} + # Caddy will automatically provision a Let's Encrypt certificate. 
gocanto.dev, www.gocanto.dev { log { @@ -29,8 +41,8 @@ oullin.io { format json } - # --- Public listener: block protected path - @protected_public path /api/generate-signature* + # --- Public listener: block protected paths + @protected_public path /api/generate-signature* /api/metrics handle @protected_public { respond 403 } @@ -117,3 +129,16 @@ oullin.io { respond 403 } } + +# INTERNAL metrics endpoint for Prometheus scraping +# This exposes ONLY /metrics, not the full admin API +# Listens on all interfaces but not published to host (Docker network only) +:9180 { + handle /metrics { + reverse_proxy localhost:2019 + } + + handle { + respond 404 + } +} diff --git a/caddy/Dockerfile b/infra/caddy/Dockerfile similarity index 93% rename from caddy/Dockerfile rename to infra/caddy/Dockerfile index 6de850e3..69757ec0 100644 --- a/caddy/Dockerfile +++ b/infra/caddy/Dockerfile @@ -1,4 +1,4 @@ -# Filename: caddy/Dockerfile +# Filename: infra/caddy/Dockerfile # This Dockerfile builds a Caddy image using a specific, stable version number. # Define a build argument for the Caddy version with a sensible default. 
diff --git a/caddy/mtls/.gitkeep b/infra/caddy/mtls/.gitkeep similarity index 100% rename from caddy/mtls/.gitkeep rename to infra/caddy/mtls/.gitkeep diff --git a/caddy/readme.md b/infra/caddy/readme.md similarity index 100% rename from caddy/readme.md rename to infra/caddy/readme.md diff --git a/docker/dockerfile-api b/infra/docker/dockerfile-api similarity index 100% rename from docker/dockerfile-api rename to infra/docker/dockerfile-api diff --git a/metal/makefile/app.mk b/infra/makefile/app.mk similarity index 66% rename from metal/makefile/app.mk rename to infra/makefile/app.mk index b29af9bb..ad08b22a 100644 --- a/metal/makefile/app.mk +++ b/infra/makefile/app.mk @@ -1,12 +1,44 @@ -.PHONY: fresh destroy audit watch format run-cli test-all run-cli-docker run-metal +# -------------------------------------------------------------------------------------------------------------------- # +# Application Management Targets +# -------------------------------------------------------------------------------------------------------------------- # -DB_SECRET_USERNAME ?= ./database/infra/secrets/pg_username -DB_SECRET_PASSWORD ?= ./database/infra/secrets/pg_password -DB_SECRET_DBNAME ?= ./database/infra/secrets/pg_dbname +# -------------------------------------------------------------------------------------------------------------------- # +# Configuration Variables +# -------------------------------------------------------------------------------------------------------------------- # + +ROOT_PATH := $(shell pwd) +DB_SECRETS_DIR := $(ROOT_PATH)/database/infra/secrets + +DB_SECRET_USERNAME ?= $(DB_SECRETS_DIR)/pg_username +DB_SECRET_PASSWORD ?= $(DB_SECRETS_DIR)/pg_password +DB_SECRET_DBNAME ?= $(DB_SECRETS_DIR)/pg_dbname + +# -------------------------------------------------------------------------------------------------------------------- # +# PHONY Targets +# 
-------------------------------------------------------------------------------------------------------------------- # + +.PHONY: fresh destroy audit watch format run-cli test-all run-cli-docker run-metal install-air + +# -------------------------------------------------------------------------------------------------------------------- # +# Code Quality Commands +# -------------------------------------------------------------------------------------------------------------------- # format: gofmt -w -s . +audit: + $(call external_deps,'.') + $(call external_deps,'./app/...') + $(call external_deps,'./database/...') + $(call external_deps,'./docs/...') + +test-all: + go test ./... + +# -------------------------------------------------------------------------------------------------------------------- # +# Docker Management Commands +# -------------------------------------------------------------------------------------------------------------------- # + fresh: docker compose down --volumes --rmi all --remove-orphans docker ps @@ -22,11 +54,9 @@ destroy: docker ps -aq | xargs --no-run-if-empty docker rm && \ docker ps -audit: - $(call external_deps,'.') - $(call external_deps,'./app/...') - $(call external_deps,'./database/...') - $(call external_deps,'./docs/...') +# -------------------------------------------------------------------------------------------------------------------- # +# Development Tools +# -------------------------------------------------------------------------------------------------------------------- # watch: # --- Works with (air). @@ -39,6 +69,10 @@ install-air: @echo "Installing air ..." 
@go install github.com/air-verse/air@latest +# -------------------------------------------------------------------------------------------------------------------- # +# CLI Commands +# -------------------------------------------------------------------------------------------------------------------- # + run-cli: @missing_values=""; \ missing_files=""; \ @@ -115,11 +149,9 @@ run-cli: printf "\n$(RED)❌ CLI exited with status $$status.$(NC)\n"; \ exit $$status; \ fi + run-cli-docker: make run-cli DB_SECRET_USERNAME=$(DB_SECRET_USERNAME) DB_SECRET_PASSWORD=$(DB_SECRET_PASSWORD) DB_SECRET_DBNAME=$(DB_SECRET_DBNAME) -test-all: - go test ./... - run-metal: go run metal/cli/main.go diff --git a/metal/makefile/build.mk b/infra/makefile/build.mk similarity index 100% rename from metal/makefile/build.mk rename to infra/makefile/build.mk diff --git a/metal/makefile/caddy.mk b/infra/makefile/caddy.mk similarity index 91% rename from metal/makefile/caddy.mk rename to infra/makefile/caddy.mk index c2f6e748..8e71e1c0 100644 --- a/metal/makefile/caddy.mk +++ b/infra/makefile/caddy.mk @@ -1,8 +1,8 @@ .PHONY: caddy-gen-certs caddy-del-certs caddy-validate caddy-fresh caddy-restart -CADDY_MTLS_DIR = $(ROOT_PATH)/caddy/mtls -APP_CADDY_CONFIG_PROD_FILE ?= caddy/Caddyfile.prod -APP_CADDY_CONFIG_LOCAL_FILE ?= caddy/Caddyfile.local +CADDY_MTLS_DIR = $(ROOT_PATH)/infra/caddy/mtls +APP_CADDY_CONFIG_PROD_FILE ?= infra/caddy/Caddyfile.prod +APP_CADDY_CONFIG_LOCAL_FILE ?= infra/caddy/Caddyfile.local caddy-restart: docker compose up -d --force-recreate caddy_prod @@ -66,6 +66,6 @@ caddy-del-certs: caddy-validate: @docker run --rm \ - -v "$(ROOT_PATH)/caddy/Caddyfile.prod:/etc/caddy/Caddyfile:ro" \ - -v "$(ROOT_PATH)/caddy/mtls:/etc/caddy/mtls:ro" \ + -v "$(ROOT_PATH)/infra/caddy/Caddyfile.prod:/etc/caddy/Caddyfile:ro" \ + -v "$(ROOT_PATH)/infra/caddy/mtls:/etc/caddy/mtls:ro" \ caddy:2.10.0 caddy validate --config /etc/caddy/Caddyfile diff --git a/metal/makefile/db.mk b/infra/makefile/db.mk 
similarity index 100% rename from metal/makefile/db.mk rename to infra/makefile/db.mk diff --git a/metal/makefile/env.mk b/infra/makefile/env.mk similarity index 100% rename from metal/makefile/env.mk rename to infra/makefile/env.mk diff --git a/metal/makefile/helpers.mk b/infra/makefile/helpers.mk similarity index 100% rename from metal/makefile/helpers.mk rename to infra/makefile/helpers.mk diff --git a/metal/makefile/infra.mk b/infra/makefile/infra.mk similarity index 100% rename from metal/makefile/infra.mk rename to infra/makefile/infra.mk diff --git a/metal/makefile/logs.mk b/infra/makefile/logs.mk similarity index 100% rename from metal/makefile/logs.mk rename to infra/makefile/logs.mk diff --git a/infra/makefile/monitor.mk b/infra/makefile/monitor.mk new file mode 100644 index 00000000..2288b911 --- /dev/null +++ b/infra/makefile/monitor.mk @@ -0,0 +1,556 @@ +# -------------------------------------------------------------------------------------------------------------------- # +# Monitoring Stack Targets +# -------------------------------------------------------------------------------------------------------------------- # + +# -------------------------------------------------------------------------------------------------------------------- # +# Configuration Variables +# -------------------------------------------------------------------------------------------------------------------- # + +ROOT_PATH := $(shell pwd) +MONITORING_DIR := $(ROOT_PATH)/infra/metrics +BACKUPS_DIR := $(ROOT_PATH)/storage/monitoring/backups + +# -------------------------------------------------------------------------------------------------------------------- # +# Volume Labels (defined in docker-compose.yml) +# -------------------------------------------------------------------------------------------------------------------- # + +PROMETHEUS_VOLUME_LOCAL := prometheus_data_local +PROMETHEUS_VOLUME_PROD := prometheus_data_prod +GRAFANA_VOLUME_LOCAL := grafana_data_local 
+GRAFANA_VOLUME_PROD := grafana_data_prod + +# Docker service names (defined in docker-compose.yml) +PROMETHEUS_SERVICE_LOCAL := prometheus_local +PROMETHEUS_SERVICE_PROD := prometheus +GRAFANA_SERVICE_LOCAL := grafana_local +GRAFANA_SERVICE_PROD := grafana +POSTGRES_EXPORTER_SERVICE_LOCAL := postgres_exporter_local +POSTGRES_EXPORTER_SERVICE_PROD := postgres_exporter + +# Monitoring service URLs and ports +GRAFANA_HOST := localhost +GRAFANA_PORT := 3000 +GRAFANA_URL := http://$(GRAFANA_HOST):$(GRAFANA_PORT) + +PROMETHEUS_HOST := localhost +PROMETHEUS_PORT := 9090 +PROMETHEUS_URL := http://$(PROMETHEUS_HOST):$(PROMETHEUS_PORT) + +CADDY_ADMIN_HOST := localhost +CADDY_ADMIN_PORT := 2019 +CADDY_ADMIN_URL := http://$(CADDY_ADMIN_HOST):$(CADDY_ADMIN_PORT) + +API_HOST := localhost +API_PORT := 18080 +API_URL := http://$(API_HOST):$(API_PORT) +PING_USERNAME ?= $(ENV_PING_USERNAME) +PING_PASSWORD ?= $(ENV_PING_PASSWORD) +PING_AUTH_FLAG := $(if $(and $(PING_USERNAME),$(PING_PASSWORD)),-u $(PING_USERNAME):$(PING_PASSWORD),) + +# Production API endpoint (behind Caddy) +API_PROD_HOST := localhost +API_PROD_URL := http://$(API_PROD_HOST) + +# Internal service URLs (Docker network) +PG_EXPORTER_HOST := postgres_exporter_local +PG_EXPORTER_PORT := 9187 +PG_EXPORTER_URL := http://$(PG_EXPORTER_HOST):$(PG_EXPORTER_PORT) + +# -------------------------------------------------------------------------------------------------------------------- # +# PHONY Targets +# -------------------------------------------------------------------------------------------------------------------- # + +.PHONY: monitor-up monitor-up-prod monitor-down monitor-down-prod monitor-restart monitor-restart-prod \ + monitor-up-full monitor-up-full-prod monitor-up-logs monitor-up-logs-prod monitor-down-remove monitor-down-remove-prod \ + monitor-pull monitor-pull-prod monitor-docker-config monitor-docker-config-prod monitor-docker-exec-prometheus monitor-docker-exec-prometheus-prod \ + monitor-docker-exec-grafana 
monitor-docker-exec-grafana-prod monitor-docker-ps monitor-docker-inspect monitor-docker-inspect-prod \ + monitor-docker-logs-prometheus monitor-docker-logs-prometheus-prod monitor-docker-logs-grafana monitor-docker-logs-grafana-prod monitor-docker-logs-db monitor-docker-logs-db-prod \ + monitor-status monitor-logs monitor-logs-prod \ + monitor-test monitor-targets monitor-config monitor-config-prod monitor-grafana monitor-prometheus \ + monitor-caddy-metrics monitor-api-metrics monitor-db-metrics monitor-db-metrics-prod monitor-metrics \ + monitor-traffic monitor-traffic-heavy monitor-traffic-prod monitor-traffic-heavy-prod \ + monitor-clean monitor-clean-prod monitor-stats monitor-stats-prod monitor-backup monitor-backup-prod monitor-export-dashboards monitor-help \ + monitor-volumes-local-check monitor-volumes-prod-check + +# -------------------------------------------------------------------------------------------------------------------- # +# Start/Stop Commands +# -------------------------------------------------------------------------------------------------------------------- # + +## Start monitoring stack (local development) +monitor-up: + @printf "$(BOLD)$(CYAN)Starting monitoring stack (local)...$(NC)\n" + @docker compose --profile local up -d $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL) + @sleep 3 + @printf "$(BOLD)$(GREEN)✓ Monitoring stack started$(NC)\n" + @printf "\n$(BOLD)Access points:$(NC)\n" + @printf " $(GREEN)Grafana:$(NC) $(GRAFANA_URL)\n" + @printf " $(GREEN)Prometheus:$(NC) $(PROMETHEUS_URL)\n" + @printf " $(GREEN)Caddy Admin:$(NC) $(CADDY_ADMIN_URL)\n\n" + +## Start monitoring stack (production) +monitor-up-prod: + @printf "$(BOLD)$(CYAN)Starting monitoring stack (production)...$(NC)\n" + @docker compose --profile prod up -d $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD) + @sleep 3 + @printf "$(BOLD)$(GREEN)✓ Monitoring stack started$(NC)\n" + @printf 
"\n$(BOLD)Access points (from server):$(NC)\n" + @printf " $(GREEN)Grafana:$(NC) $(GRAFANA_URL)\n" + @printf " $(GREEN)Prometheus:$(NC) $(PROMETHEUS_URL)\n" + @printf " $(GREEN)Caddy Admin:$(NC) $(CADDY_ADMIN_URL)\n\n" + +## Stop monitoring stack (local) +monitor-down: + @printf "$(BOLD)$(CYAN)Stopping monitoring stack (local)...$(NC)\n" + @docker compose --profile local stop $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL) + @printf "$(BOLD)$(GREEN)✓ Monitoring stack stopped$(NC)\n\n" + +## Stop monitoring stack (production) +monitor-down-prod: + @printf "$(BOLD)$(CYAN)Stopping monitoring stack (production)...$(NC)\n" + @docker compose --profile prod stop $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD) + @printf "$(BOLD)$(GREEN)✓ Monitoring stack stopped$(NC)\n\n" + +## Restart monitoring stack (local) +monitor-restart: + @printf "$(BOLD)$(CYAN)Restarting monitoring stack (local)...$(NC)\n" + @docker compose --profile local restart $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL) + @printf "$(BOLD)$(GREEN)✓ Monitoring stack restarted$(NC)\n\n" + +## Restart monitoring stack (production) +monitor-restart-prod: + @printf "$(BOLD)$(CYAN)Restarting monitoring stack (production)...$(NC)\n" + @docker compose --profile prod restart $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD) + @printf "$(BOLD)$(GREEN)✓ Monitoring stack restarted$(NC)\n\n" + +# -------------------------------------------------------------------------------------------------------------------- # +# Docker Compose Commands +# -------------------------------------------------------------------------------------------------------------------- # + +## Start monitoring with full stack (API + DB + monitoring) - local +monitor-up-full: + @printf "$(BOLD)$(CYAN)Starting full stack with monitoring (local)...$(NC)\n" + @docker compose --profile local up -d + 
@sleep 3 + @printf "$(BOLD)$(GREEN)✓ Full stack started$(NC)\n\n" + +## Start monitoring with full stack (API + DB + monitoring) - production +monitor-up-full-prod: + @printf "$(BOLD)$(CYAN)Starting full stack with monitoring (production)...$(NC)\n" + @docker compose --profile prod up -d + @sleep 3 + @printf "$(BOLD)$(GREEN)✓ Full stack started$(NC)\n\n" + +## Start monitoring stack with logs (foreground) - local +monitor-up-logs: + @printf "$(BOLD)$(CYAN)Starting monitoring stack with logs (local)...$(NC)\n" + @docker compose --profile local up $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL) + +## Start monitoring stack with logs (foreground) - production +monitor-up-logs-prod: + @printf "$(BOLD)$(CYAN)Starting monitoring stack with logs (production)...$(NC)\n" + @docker compose --profile prod up $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD) + +## Stop and remove monitoring containers - local +monitor-down-remove: + @printf "$(BOLD)$(CYAN)Stopping and removing monitoring containers (local)...$(NC)\n" + @docker compose --profile local down $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL) + @printf "$(BOLD)$(GREEN)✓ Containers stopped and removed$(NC)\n\n" + +## Stop and remove monitoring containers - production +monitor-down-remove-prod: + @printf "$(BOLD)$(CYAN)Stopping and removing monitoring containers (production)...$(NC)\n" + @docker compose --profile prod down $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD) + @printf "$(BOLD)$(GREEN)✓ Containers stopped and removed$(NC)\n\n" + +## Pull latest monitoring images (local) +monitor-pull: + @printf "$(BOLD)$(CYAN)Pulling latest monitoring images (local)...$(NC)\n" + @docker compose pull $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL) + @printf "$(BOLD)$(GREEN)✓ Images pulled$(NC)\n\n" + +## Pull latest monitoring images 
(production) +monitor-pull-prod: + @printf "$(BOLD)$(CYAN)Pulling latest monitoring images (production)...$(NC)\n" + @docker compose pull $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD) + @printf "$(BOLD)$(GREEN)✓ Images pulled$(NC)\n\n" + +## Show docker compose config for monitoring services (local); --profile is a global option and must precede the subcommand +monitor-docker-config: + @printf "$(BOLD)$(CYAN)Docker Compose Configuration (monitoring - local)$(NC)\n\n" + @docker compose --profile local config | grep -A 20 "$(PROMETHEUS_SERVICE_LOCAL)\|$(GRAFANA_SERVICE_LOCAL)\|$(POSTGRES_EXPORTER_SERVICE_LOCAL)" || docker compose --profile local config + +## Show docker compose config for monitoring services (production); --profile is a global option and must precede the subcommand +monitor-docker-config-prod: + @printf "$(BOLD)$(CYAN)Docker Compose Configuration (monitoring - production)$(NC)\n\n" + @docker compose --profile prod config | grep -A 20 "$(PROMETHEUS_SERVICE_PROD)\|$(GRAFANA_SERVICE_PROD)\|$(POSTGRES_EXPORTER_SERVICE_PROD)" || docker compose --profile prod config + +## Execute command in Prometheus container (local) +monitor-docker-exec-prometheus: + @printf "$(BOLD)$(CYAN)Executing shell in Prometheus container (local)...$(NC)\n" + @docker exec -it oullin_prometheus_local /bin/sh + +## Execute command in Prometheus container (production) +monitor-docker-exec-prometheus-prod: + @printf "$(BOLD)$(CYAN)Executing shell in Prometheus container (production)...$(NC)\n" + @docker exec -it oullin_prometheus /bin/sh + +## Execute command in Grafana container (local) +monitor-docker-exec-grafana: + @printf "$(BOLD)$(CYAN)Executing shell in Grafana container (local)...$(NC)\n" + @docker exec -it oullin_grafana_local /bin/sh + +## Execute command in Grafana container (production) +monitor-docker-exec-grafana-prod: + @printf "$(BOLD)$(CYAN)Executing shell in Grafana container (production)...$(NC)\n" + @docker exec -it oullin_grafana /bin/sh + +## Show docker ps for monitoring containers +monitor-docker-ps: + @printf "$(BOLD)$(CYAN)Monitoring 
Containers$(NC)\n\n" + @docker ps --filter "name=prometheus" --filter "name=grafana" --filter "name=exporter" --format "table {{.ID}}\t{{.Names}}\t{{.Status}}\t{{.Ports}}" + @printf "\n" + +## Show docker inspect for monitoring containers (local) +monitor-docker-inspect: + @printf "$(BOLD)$(CYAN)Inspecting Monitoring Containers (local)$(NC)\n\n" + @docker inspect oullin_prometheus_local oullin_grafana_local oullin_postgres_exporter_local 2>/dev/null | jq '.[].Name, .[].State, .[].NetworkSettings.Networks' || echo "$(RED)Containers not running$(NC)" + +## Show docker inspect for monitoring containers (production) +monitor-docker-inspect-prod: + @printf "$(BOLD)$(CYAN)Inspecting Monitoring Containers (production)$(NC)\n\n" + @docker inspect oullin_prometheus oullin_grafana oullin_postgres_exporter 2>/dev/null | jq '.[].Name, .[].State, .[].NetworkSettings.Networks' || echo "$(RED)Containers not running$(NC)" + +## View monitoring container logs (docker logs - local) +monitor-docker-logs-prometheus: + @docker logs -f oullin_prometheus_local + +monitor-docker-logs-grafana: + @docker logs -f oullin_grafana_local + +monitor-docker-logs-db: + @docker logs -f oullin_postgres_exporter_local + +## View monitoring container logs (docker logs - production) +monitor-docker-logs-prometheus-prod: + @docker logs -f oullin_prometheus + +monitor-docker-logs-grafana-prod: + @docker logs -f oullin_grafana + +monitor-docker-logs-db-prod: + @docker logs -f oullin_postgres_exporter + +# -------------------------------------------------------------------------------------------------------------------- # +# Status & Information Commands +# -------------------------------------------------------------------------------------------------------------------- # + +## Show status of monitoring services +monitor-status: + @printf "$(BOLD)$(CYAN)Monitoring Stack Status$(NC)\n\n" + @docker ps --filter "name=prometheus" --filter "name=grafana" --filter "name=exporter" --format "table 
{{.Names}}\t{{.Status}}\t{{.Ports}}" + @printf "\n" + +## Show logs from all monitoring services (local) +monitor-logs: + @printf "$(BOLD)$(CYAN)Monitoring Stack Logs (local)$(NC)\n\n" + @docker compose logs -f $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL) + +## Show logs from all monitoring services (production) +monitor-logs-prod: + @printf "$(BOLD)$(CYAN)Monitoring Stack Logs (production)$(NC)\n\n" + @docker compose logs -f $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD) + +# -------------------------------------------------------------------------------------------------------------------- # +# Testing & Verification Commands +# -------------------------------------------------------------------------------------------------------------------- # + +## Run full monitoring stack test suite (local profile only) +monitor-test: + @printf "$(BOLD)$(CYAN)Running monitoring stack tests (local profile)...$(NC)\n" + @printf "$(YELLOW)Note: This target is for local development only.$(NC)\n" + @printf "$(YELLOW)For production, verify monitoring from the server directly.$(NC)\n\n" + @printf "$(BOLD)1. Checking services are running...$(NC)\n" + @docker ps --filter "name=$(PROMETHEUS_SERVICE_LOCAL)" --filter "name=$(GRAFANA_SERVICE_LOCAL)" --filter "name=$(POSTGRES_EXPORTER_SERVICE_LOCAL)" --format " ✓ {{.Names}}: {{.Status}}" || echo " $(RED)✗ Services not running$(NC)" + @printf "\n$(BOLD)2. Testing Prometheus targets...$(NC)\n" + @curl -s $(PROMETHEUS_URL)/api/v1/targets | grep -q '"health":"up"' && echo " $(GREEN)✓ Prometheus targets are UP$(NC)" || echo " $(RED)✗ Some targets are DOWN$(NC)" + @printf "\n$(BOLD)3. Testing Caddy metrics endpoint...$(NC)\n" + @curl -s $(CADDY_ADMIN_URL)/metrics | grep -q "caddy_http_requests_total" && echo " $(GREEN)✓ Caddy metrics accessible$(NC)" || echo " $(RED)✗ Caddy metrics unavailable$(NC)" + @printf "\n$(BOLD)4. 
Testing API metrics endpoint...$(NC)\n" + @curl -s $(API_URL)/metrics | grep -q "go_goroutines" && echo " $(GREEN)✓ API metrics accessible$(NC)" || echo " $(RED)✗ API metrics unavailable$(NC)" + @printf "\n$(BOLD)5. Testing Grafana...$(NC)\n" + @curl -s $(GRAFANA_URL)/api/health | grep -q "ok" && echo " $(GREEN)✓ Grafana is healthy$(NC)" || echo " $(RED)✗ Grafana is unhealthy$(NC)" + @printf "\n$(BOLD)$(GREEN)Test suite completed!$(NC)\n\n" + +## Verify Prometheus targets status +monitor-targets: + @printf "$(BOLD)$(CYAN)Prometheus Targets Status$(NC)\n\n" + @curl -s $(PROMETHEUS_URL)/api/v1/targets | jq -r '.data.activeTargets[] | "[\(.health | ascii_upcase)] \(.labels.job) - \(.scrapeUrl)"' || echo "$(RED)Failed to fetch targets. Is Prometheus running?$(NC)" + @printf "\n" + +## Check Prometheus configuration (local) +monitor-config: + @printf "$(BOLD)$(CYAN)Prometheus Configuration (local)$(NC)\n\n" + @docker exec oullin_prometheus_local cat /etc/prometheus/prometheus.yml + +## Check Prometheus configuration (production) +monitor-config-prod: + @printf "$(BOLD)$(CYAN)Prometheus Configuration (production)$(NC)\n\n" + @docker exec oullin_prometheus cat /etc/prometheus/prometheus.yml + +# -------------------------------------------------------------------------------------------------------------------- # +# Metrics Access Commands +# -------------------------------------------------------------------------------------------------------------------- # + +## Open Grafana in browser (tries xdg-open, then open; prints the URL if neither is available or the opener fails) +monitor-grafana: + @printf "$(BOLD)$(CYAN)Opening Grafana...$(NC)\n" + @printf "URL: $(GREEN)$(GRAFANA_URL)$(NC)\n" + @printf "Credentials: admin / (set via GRAFANA_ADMIN_PASSWORD)\n\n" + @if command -v xdg-open > /dev/null 2>&1; then xdg-open $(GRAFANA_URL); elif command -v open > /dev/null 2>&1; then open $(GRAFANA_URL); else false; fi || echo "Please open $(GRAFANA_URL) in your browser" + +## Open Prometheus in browser +monitor-prometheus: + @printf "$(BOLD)$(CYAN)Opening Prometheus...$(NC)\n" + @printf "URL: 
$(GREEN)$(PROMETHEUS_URL)$(NC)\n\n" + @if command -v xdg-open > /dev/null 2>&1; then xdg-open $(PROMETHEUS_URL); elif command -v open > /dev/null 2>&1; then open $(PROMETHEUS_URL); else false; fi || echo "Please open $(PROMETHEUS_URL) in your browser" + +## Show Caddy metrics +monitor-caddy-metrics: + @printf "$(BOLD)$(CYAN)Caddy Metrics$(NC)\n\n" + @curl -s $(CADDY_ADMIN_URL)/metrics | grep "^caddy_" | head -20 + @printf "\n$(YELLOW)... (showing first 20 metrics)$(NC)\n" + @printf "Full metrics: $(GREEN)$(CADDY_ADMIN_URL)/metrics$(NC)\n\n" + +## Show API metrics +monitor-api-metrics: + @printf "$(BOLD)$(CYAN)API Metrics$(NC)\n\n" + @curl -s $(API_URL)/metrics | grep "^go_" | head -20 + @printf "\n$(YELLOW)... (showing first 20 metrics)$(NC)\n" + @printf "Full metrics: $(GREEN)$(API_URL)/metrics$(NC)\n\n" + +## Show PostgreSQL metrics (local) - fetched from inside the Prometheus container with BusyBox wget +monitor-db-metrics: + @printf "$(BOLD)$(CYAN)PostgreSQL Metrics (local)$(NC)\n\n" + @docker exec oullin_prometheus_local wget -qO- $(PG_EXPORTER_URL)/metrics | grep "^pg_" | head -20 + @printf "\n$(YELLOW)... (showing first 20 metrics)$(NC)\n\n" + +## Show PostgreSQL metrics (production) - fetched from inside the Prometheus container with BusyBox wget +monitor-db-metrics-prod: + @printf "$(BOLD)$(CYAN)PostgreSQL Metrics (production)$(NC)\n\n" + @docker exec oullin_prometheus wget -qO- http://postgres_exporter:9187/metrics | grep "^pg_" | head -20 + @printf "\n$(YELLOW)... 
(showing first 20 metrics)$(NC)\n\n" + +## Show all metrics endpoints +monitor-metrics: + @printf "$(BOLD)$(CYAN)Available Metrics Endpoints$(NC)\n\n" + @printf " $(GREEN)Caddy:$(NC) $(CADDY_ADMIN_URL)/metrics\n" + @printf " $(GREEN)API:$(NC) $(API_URL)/metrics\n" + @printf " $(GREEN)PostgreSQL:$(NC) $(PG_EXPORTER_URL)/metrics (internal)\n" + @printf " $(GREEN)Prometheus:$(NC) $(PROMETHEUS_URL)/metrics\n\n" + +# -------------------------------------------------------------------------------------------------------------------- # +# Traffic Generation & Testing +# -------------------------------------------------------------------------------------------------------------------- # + +## Generate test traffic to populate metrics (local profile) +monitor-traffic: + @if [ -z "$(PING_USERNAME)" ] || [ -z "$(PING_PASSWORD)" ]; then \ + printf "$(RED)Missing ping credentials. Export ENV_PING_USERNAME/ENV_PING_PASSWORD or pass PING_USERNAME/PING_PASSWORD to make.$(NC)\n"; \ + exit 1; \ + fi + @printf "$(BOLD)$(CYAN)Generating test traffic (local)...$(NC)\n" + @printf "Making 100 requests to /ping endpoint...\n" + @for i in $$(seq 1 100); do \ + curl -s $(PING_AUTH_FLAG) $(API_URL)/ping > /dev/null && printf "." 
|| printf "$(RED)✗$(NC)"; \ + sleep 0.1; \ + done + @printf "\n$(BOLD)$(GREEN)✓ Test traffic generated$(NC)\n" + @printf "\nCheck dashboards at: $(GREEN)$(GRAFANA_URL)$(NC)\n\n" + +## Generate heavy test traffic (local profile) +monitor-traffic-heavy: + @printf "$(BOLD)$(CYAN)Generating heavy test traffic (local)...$(NC)\n" + @printf "Making 500 requests with 5 concurrent connections...\n" + @for i in $$(seq 1 100); do \ + (for j in $$(seq 1 5); do curl -s $(API_URL)/ping > /dev/null & done; wait); \ + printf "."; \ + sleep 0.05; \ + done + @printf "\n$(BOLD)$(GREEN)✓ Heavy test traffic generated$(NC)\n\n" + +## Generate test traffic to populate metrics (production profile) +monitor-traffic-prod: + @printf "$(BOLD)$(CYAN)Generating test traffic (production)...$(NC)\n" + @printf "Making 100 requests to /api/ping endpoint...\n" + @for i in $$(seq 1 100); do \ + curl -s $(API_PROD_URL)/api/ping > /dev/null && printf "." || printf "$(RED)✗$(NC)"; \ + sleep 0.1; \ + done + @printf "\n$(BOLD)$(GREEN)✓ Test traffic generated$(NC)\n" + @printf "\n$(YELLOW)Note: Run this from the production server$(NC)\n" + @printf "SSH tunnel for Grafana: $(GREEN)ssh -L 3000:localhost:3000 user@server$(NC)\n\n" + +## Generate heavy test traffic (production profile) +monitor-traffic-heavy-prod: + @printf "$(BOLD)$(CYAN)Generating heavy test traffic (production)...$(NC)\n" + @printf "Making 500 requests with 5 concurrent connections...\n" + @for i in $$(seq 1 100); do \ + (for j in $$(seq 1 5); do curl -s $(API_PROD_URL)/api/ping > /dev/null & done; wait); \ + printf "."; \ + sleep 0.05; \ + done + @printf "\n$(BOLD)$(GREEN)✓ Heavy test traffic generated$(NC)\n" + @printf "\n$(YELLOW)Note: Run this from the production server$(NC)\n\n" + +# -------------------------------------------------------------------------------------------------------------------- # +# Utility Commands +# 
-------------------------------------------------------------------------------------------------------------------- # + +## Clean monitoring data (removes all metrics/dashboard data) - local; waits for Enter before deleting anything +monitor-clean: monitor-volumes-local-check + @printf "$(BOLD)$(RED)WARNING: This will delete all monitoring data (local)!$(NC)\n" + @printf "Press Ctrl+C to cancel, or Enter to continue..." + @read -r _confirm + @printf "$(BOLD)$(CYAN)Stopping monitoring stack...$(NC)\n" + @docker compose --profile local down $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) + @printf "$(BOLD)$(CYAN)Removing volumes...$(NC)\n" + @docker volume rm -f $(PROMETHEUS_VOLUME_LOCAL) $(GRAFANA_VOLUME_LOCAL) || true + @printf "$(BOLD)$(GREEN)✓ Monitoring data cleaned$(NC)\n\n" + +## Clean monitoring data (removes all metrics/dashboard data) - production +monitor-clean-prod: monitor-volumes-prod-check + @printf "$(BOLD)$(RED)WARNING: This will delete all monitoring data (production)!$(NC)\n" + @printf "Press Ctrl+C to cancel, or Enter to continue..." 
+ @read -r _confirm + @printf "$(BOLD)$(CYAN)Stopping monitoring stack...$(NC)\n" + @docker compose --profile prod down $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) + @printf "$(BOLD)$(CYAN)Removing volumes...$(NC)\n" + @docker volume rm -f $(PROMETHEUS_VOLUME_PROD) $(GRAFANA_VOLUME_PROD) || true + @printf "$(BOLD)$(GREEN)✓ Monitoring data cleaned$(NC)\n\n" + +## Show monitoring stack resource usage (local) +monitor-stats: + @printf "$(BOLD)$(CYAN)Monitoring Stack Resource Usage (local)$(NC)\n\n" + @docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" \ + oullin_prometheus_local oullin_grafana_local oullin_postgres_exporter_local 2>/dev/null || \ + echo "$(RED)No monitoring containers running$(NC)" + @printf "\n" + +## Show monitoring stack resource usage (production) +monitor-stats-prod: + @printf "$(BOLD)$(CYAN)Monitoring Stack Resource Usage (production)$(NC)\n\n" + @docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" \ + oullin_prometheus oullin_grafana oullin_postgres_exporter 2>/dev/null || \ + echo "$(RED)No monitoring containers running$(NC)" + @printf "\n" + +## Backup Prometheus data (with automatic rotation) - local +monitor-backup: monitor-volumes-local-check + @printf "$(BOLD)$(CYAN)Backing up Prometheus data (local)...$(NC)\n" + @mkdir -p $(BACKUPS_DIR) + @docker run --rm -v $(PROMETHEUS_VOLUME_LOCAL):/data -v $(BACKUPS_DIR):/backup alpine \ + tar czf /backup/prometheus-backup-$$(date +%Y%m%d-%H%M%S).tar.gz /data + @printf "$(BOLD)$(GREEN)✓ Backup created in $(BACKUPS_DIR)/$(NC)\n" + @printf "$(YELLOW)Rotating backups (keeping last 5)...$(NC)\n" + @for f in $$(ls -t $(BACKUPS_DIR)/prometheus-backup-*.tar.gz 2>/dev/null | tail -n +6); do rm -f "$$f"; done || true + @BACKUP_COUNT=$$(ls -1 $(BACKUPS_DIR)/prometheus-backup-*.tar.gz 2>/dev/null | wc -l); \ + printf "$(BOLD)$(GREEN)✓ Backup rotation complete ($${BACKUP_COUNT} backups kept)$(NC)\n\n" + 
+monitor-volumes-local-check: + @[ -n "$(PROMETHEUS_VOLUME_LOCAL)" ] && [ -n "$(GRAFANA_VOLUME_LOCAL)" ] || \ + { printf "$(RED)Unable to resolve monitoring volumes from docker compose config (local profile).$(NC)\n"; exit 1; } + +## Backup Prometheus data (with automatic rotation) - production +monitor-backup-prod: monitor-volumes-prod-check + @printf "$(BOLD)$(CYAN)Backing up Prometheus data (production)...$(NC)\n" + @mkdir -p $(BACKUPS_DIR) + @docker run --rm -v $(PROMETHEUS_VOLUME_PROD):/data -v $(BACKUPS_DIR):/backup alpine \ + tar czf /backup/prometheus-prod-backup-$$(date +%Y%m%d-%H%M%S).tar.gz /data + @printf "$(BOLD)$(GREEN)✓ Backup created in $(BACKUPS_DIR)/$(NC)\n" + @printf "$(YELLOW)Rotating backups (keeping last 5)...$(NC)\n" + @for f in $$(ls -t $(BACKUPS_DIR)/prometheus-prod-backup-*.tar.gz 2>/dev/null | tail -n +6); do rm -f "$$f"; done || true + @BACKUP_COUNT=$$(ls -1 $(BACKUPS_DIR)/prometheus-prod-backup-*.tar.gz 2>/dev/null | wc -l); \ + printf "$(BOLD)$(GREEN)✓ Backup rotation complete ($${BACKUP_COUNT} backups kept)$(NC)\n\n" + +monitor-volumes-prod-check: + @[ -n "$(PROMETHEUS_VOLUME_PROD)" ] && [ -n "$(GRAFANA_VOLUME_PROD)" ] || \ + { printf "$(RED)Unable to resolve monitoring volumes from docker compose config (production profile).$(NC)\n"; exit 1; } + +## Export Grafana dashboards to JSON files +monitor-export-dashboards: + @printf "$(BOLD)$(CYAN)Exporting Grafana dashboards...$(NC)\n" + @$(MONITORING_DIR)/grafana/scripts/export-dashboards.sh + +## Show monitoring help +monitor-help: + @printf "\n$(BOLD)$(CYAN)Monitoring Stack Commands$(NC)\n\n" + @printf "$(BOLD)$(BLUE)Start/Stop:$(NC)\n" + @printf " $(GREEN)monitor-up$(NC) - Start monitoring stack (local)\n" + @printf " $(GREEN)monitor-up-prod$(NC) - Start monitoring stack (production)\n" + @printf " $(GREEN)monitor-up-full$(NC) - Start full stack with monitoring (local)\n" + @printf " $(GREEN)monitor-up-full-prod$(NC) - Start full stack with monitoring (prod)\n" + @printf " 
$(GREEN)monitor-up-logs$(NC) - Start with logs in foreground (local)\n" + @printf " $(GREEN)monitor-up-logs-prod$(NC) - Start with logs in foreground (prod)\n" + @printf " $(GREEN)monitor-down$(NC) - Stop monitoring stack (local)\n" + @printf " $(GREEN)monitor-down-prod$(NC) - Stop monitoring stack (production)\n" + @printf " $(GREEN)monitor-down-remove$(NC) - Stop and remove containers (local)\n" + @printf " $(GREEN)monitor-down-remove-prod$(NC) - Stop and remove containers (prod)\n" + @printf " $(GREEN)monitor-restart$(NC) - Restart monitoring stack (local)\n" + @printf " $(GREEN)monitor-restart-prod$(NC) - Restart monitoring stack (prod)\n\n" + @printf "$(BOLD)$(BLUE)Docker Commands:$(NC)\n" + @printf " $(GREEN)monitor-docker-ps$(NC) - Show running monitoring containers\n" + @printf " $(GREEN)monitor-docker-config$(NC) - Show docker compose config (local)\n" + @printf " $(GREEN)monitor-docker-config-prod$(NC) - Show docker compose config (prod)\n" + @printf " $(GREEN)monitor-docker-inspect$(NC) - Inspect monitoring containers (local)\n" + @printf " $(GREEN)monitor-docker-inspect-prod$(NC) - Inspect monitoring containers (prod)\n" + @printf " $(GREEN)monitor-docker-exec-prometheus$(NC) - Shell into Prometheus container (local)\n" + @printf " $(GREEN)monitor-docker-exec-prometheus-prod$(NC)- Shell into Prometheus container (prod)\n" + @printf " $(GREEN)monitor-docker-exec-grafana$(NC) - Shell into Grafana container (local)\n" + @printf " $(GREEN)monitor-docker-exec-grafana-prod$(NC) - Shell into Grafana container (prod)\n" + @printf " $(GREEN)monitor-docker-logs-prometheus$(NC) - Docker logs for Prometheus (local)\n" + @printf " $(GREEN)monitor-docker-logs-prometheus-prod$(NC)- Docker logs for Prometheus (prod)\n" + @printf " $(GREEN)monitor-docker-logs-grafana$(NC) - Docker logs for Grafana (local)\n" + @printf " $(GREEN)monitor-docker-logs-grafana-prod$(NC) - Docker logs for Grafana (prod)\n" + @printf " $(GREEN)monitor-docker-logs-db$(NC) - Docker logs for DB 
exporter (local)\n" + @printf " $(GREEN)monitor-docker-logs-db-prod$(NC) - Docker logs for DB exporter (prod)\n" + @printf " $(GREEN)monitor-pull$(NC) - Pull latest monitoring images (local)\n" + @printf " $(GREEN)monitor-pull-prod$(NC) - Pull latest monitoring images (prod)\n\n" + @printf "$(BOLD)$(BLUE)Status & Logs:$(NC)\n" + @printf " $(GREEN)monitor-status$(NC) - Show status of monitoring services\n" + @printf " $(GREEN)monitor-logs$(NC) - Show logs from all services (local)\n" + @printf " $(GREEN)monitor-logs-prod$(NC) - Show logs from all services (prod)\n\n" + @printf "$(BOLD)$(BLUE)Testing:$(NC)\n" + @printf " $(GREEN)monitor-test$(NC) - Run full test suite (local only)\n" + @printf " $(GREEN)monitor-targets$(NC) - Show Prometheus targets status\n" + @printf " $(GREEN)monitor-traffic$(NC) - Generate test traffic (local)\n" + @printf " $(GREEN)monitor-traffic-heavy$(NC) - Generate heavy test traffic (local)\n" + @printf " $(GREEN)monitor-traffic-prod$(NC) - Generate test traffic (production)\n" + @printf " $(GREEN)monitor-traffic-heavy-prod$(NC) - Generate heavy test traffic (prod)\n\n" + @printf "$(BOLD)$(BLUE)Access:$(NC)\n" + @printf " $(GREEN)monitor-grafana$(NC) - Open Grafana in browser\n" + @printf " $(GREEN)monitor-prometheus$(NC) - Open Prometheus in browser\n" + @printf " $(GREEN)monitor-metrics$(NC) - Show all metrics endpoints\n" + @printf " $(GREEN)monitor-caddy-metrics$(NC) - Show Caddy metrics\n" + @printf " $(GREEN)monitor-api-metrics$(NC) - Show API metrics\n" + @printf " $(GREEN)monitor-db-metrics$(NC) - Show PostgreSQL metrics (local)\n" + @printf " $(GREEN)monitor-db-metrics-prod$(NC) - Show PostgreSQL metrics (prod)\n\n" + @printf "$(BOLD)$(BLUE)Utilities:$(NC)\n" + @printf " $(GREEN)monitor-stats$(NC) - Show resource usage (local)\n" + @printf " $(GREEN)monitor-stats-prod$(NC) - Show resource usage (prod)\n" + @printf " $(GREEN)monitor-config$(NC) - Show Prometheus config (local)\n" + @printf " $(GREEN)monitor-config-prod$(NC) - Show 
Prometheus config (prod)\n" + @printf " $(GREEN)monitor-backup$(NC) - Backup Prometheus data (local)\n" + @printf " $(GREEN)monitor-backup-prod$(NC) - Backup Prometheus data (prod)\n" + @printf " $(GREEN)monitor-export-dashboards$(NC) - Export Grafana dashboards to JSON\n" + @printf " $(GREEN)monitor-clean$(NC) - Clean all monitoring data (local)\n" + @printf " $(GREEN)monitor-clean-prod$(NC) - Clean all monitoring data (prod)\n\n" + @printf "$(BOLD)Quick Start:$(NC)\n" + @printf " 1. $(YELLOW)make monitor-up$(NC) - Start the stack\n" + @printf " 2. $(YELLOW)make monitor-test$(NC) - Verify everything works\n" + @printf " 3. $(YELLOW)make monitor-traffic$(NC) - Generate some traffic\n" + @printf " 4. $(YELLOW)make monitor-grafana$(NC) - Open dashboards\n\n" + @printf "$(BOLD)Docker Compose Examples:$(NC)\n" + @printf " $(YELLOW)docker compose --profile local up -d$(NC) - Start local stack\n" + @printf " $(YELLOW)docker compose --profile prod up -d$(NC) - Start prod stack\n" + @printf " $(YELLOW)docker ps --filter name=prometheus$(NC) - List containers\n" + @printf " $(YELLOW)docker exec -it oullin_prometheus_local /bin/sh$(NC) - Shell access\n\n" diff --git a/infra/metrics/README.md b/infra/metrics/README.md new file mode 100644 index 00000000..81112578 --- /dev/null +++ b/infra/metrics/README.md @@ -0,0 +1,712 @@ +# Monitoring Stack Documentation + +Complete guide for managing and monitoring the Oullin application stack with Prometheus, Grafana, and related tools. + +## Table of Contents + +1. [Overview](#overview) +2. [Quick Start](#quick-start) +3. [Security Model](#security-model) +4. [Grafana Dashboards](#grafana-dashboards) +5. [Creating Custom Dashboards](#creating-custom-dashboards) +6. [Prometheus Queries](#prometheus-queries) +7. [Troubleshooting](#troubleshooting) +8. [Maintenance & Backup](#maintenance--backup) +9. 
[Resources](#resources) + +**For VPS deployment instructions, see [VPS_DEPLOYMENT.md](./VPS_DEPLOYMENT.md)** + +--- + +## Overview + +### Stack Components + +- **Prometheus**: Metrics collection and time-series storage +- **Grafana**: Visualization dashboards and alerting +- **postgres_exporter**: PostgreSQL database metrics +- **Caddy Admin API**: Reverse proxy metrics + +### Pre-configured Dashboards + +Three dashboards are automatically provisioned: + +1. **Oullin - Overview** (`grafana/dashboards/oullin-overview-oullin-overview.json`) + - Caddy request rate + - PostgreSQL active connections + - HTTP requests by status code + - API memory usage and goroutines + +2. **PostgreSQL - Database Metrics** (`grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json`) + - Active connections + - Database size + - Transaction rates + - Cache hit ratio + - Lock statistics + +3. **Caddy - Proxy Metrics** (`grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json`) + - Total request rate + - Response time percentiles + - Requests by status code + - Traffic rate + - Request errors + +### Directory Structure + +```text +infra/metrics/ +├── README.md # This file +├── grafana/ +│ ├── dashboards/ # Dashboard JSON files +│ ├── provisioning/ +│ │ ├── dashboards/ # Dashboard provisioning config +│ │ └── datasources/ # Data source configuration +│ └── scripts/ +│ └── export-dashboards.sh +└── prometheus/ + ├── provisioning/ + │ ├── prometheus.yml # Production Prometheus config + │ └── prometheus.local.yml # Local Prometheus config + └── scripts/ + └── postgres-exporter-entrypoint.sh +``` + +### Configuration Consistency + +The monitoring stack is designed to maintain configuration consistency across local and production environments while respecting environment-specific differences. + +#### Shared Configuration Elements + +The following configurations are **identical** across both environments: + +1. 
**Grafana Settings:** + - Same Grafana version (`grafana/grafana:11.4.0`) + - Identical security settings (admin user, sign-up disabled, anonymous disabled) + - Same dashboard and datasource provisioning structure + - Same volume mount paths + +2. **Prometheus Core Settings:** + - Same Prometheus version (`prom/prometheus:v3.0.1`) + - Identical scrape interval (15s) and evaluation interval (15s) + - Same job structure (caddy, postgresql, api, prometheus) with per-environment targets + - Same metrics endpoints and paths + +3. **Postgres Exporter:** + - Same exporter version (`prometheuscommunity/postgres-exporter:v0.15.0`) + - Identical port exposure (9187) + - Same entrypoint script and secrets handling + +#### Environment-Specific Variables + +These settings **differ intentionally** based on environment: + +| Configuration | Local | Production | Reason | +|--------------|-------|------------|--------| +| **Container Names** | `oullin_*_local` | `oullin_*` | Distinguish environments | +| **Prometheus URL** | `oullin_prometheus_local:9090` | `oullin_prometheus:9090` | Network addressing | +| **Grafana Port** | `3000:3000` | `127.0.0.1:3000:3000` | Security (prod localhost-only) | +| **Prometheus Port** | `9090:9090` | `127.0.0.1:9090:9090` | Security (prod localhost-only) | +| **Data Retention** | 7 days | 30 days | Storage/cost optimization | +| **Caddy Target** | `caddy_local:9180` | `caddy_prod:9180` | Service dependencies | +| **PostgreSQL Exporter Target** | `oullin_postgres_exporter_local:9187` | `oullin_postgres_exporter:9187` | Service dependencies | +| **External Labels** | `monitor: 'oullin-local'`
`environment: 'local'` | `monitor: 'oullin-prod'`
`environment: 'production'` | Metric identification | +| **Admin API** | `127.0.0.1:2019:2019` | Not exposed | Debugging access | + +#### Environment Variable Usage + +The configuration uses environment variables to maintain consistency while adapting to each environment: + +**Grafana Datasource** (`grafana/provisioning/datasources/prometheus.yml`): +```yaml +url: ${GF_DATASOURCE_PROMETHEUS_URL} +``` + +Set via Docker Compose: +- **Local:** `GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus_local:9090` +- **Production:** `GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus:9090` + +**Required Environment Variables:** +- `GRAFANA_ADMIN_PASSWORD` - **Required**, no default (set in `.env`) +- `GF_DATASOURCE_PROMETHEUS_URL` - Set automatically by Docker Compose profile + +#### Configuration Files by Environment + +**Local Environment:** +- Prometheus: `prometheus/provisioning/prometheus.local.yml` +- Profile: `--profile local` +- Services: `prometheus_local`, `grafana_local`, `caddy_local`, `postgres_exporter_local` + +**Production Environment:** +- Prometheus: `prometheus/provisioning/prometheus.yml` +- Profile: `--profile prod` +- Services: `prometheus`, `grafana`, `caddy_prod`, `postgres_exporter` + +**Shared Across All Environments:** +- Grafana datasources: `grafana/provisioning/datasources/prometheus.yml` +- Grafana dashboards: `grafana/provisioning/dashboards/default.yml` +- Dashboard JSONs: `grafana/dashboards/*.json` +- Postgres exporter script: `prometheus/scripts/postgres-exporter-entrypoint.sh` + +--- + +## Quick Start + +### Local Development + +**Prerequisites:** +- Docker and Docker Compose installed +- `.env` file in the repository root with `GRAFANA_ADMIN_PASSWORD` set (required - no default) + - Use `make env:init` to copy `.env.example` if you need a starting point + - If `.env` already exists, edit it in place instead of appending duplicates +- Database secrets in `database/infra/secrets/` + +**Setup:** + +```bash +# 1. 
Set Grafana admin password in .env file +echo "GRAFANA_ADMIN_PASSWORD=$(openssl rand -base64 32)" >> .env +# (Add or update the key manually if the file already defines it.) + +# 2. Start the local monitoring stack +make monitor-up +# Or: docker compose --profile local up -d + +# 3. Access services +# Grafana: http://localhost:3000 (admin / your-password) +# Prometheus: http://localhost:9090 +# Caddy Admin: http://localhost:2019 +``` + +**Verification:** + +```bash +# Check all services are running +docker ps + +# Verify Prometheus targets are UP +make monitor-targets +# Or: curl http://localhost:9090/api/v1/targets + +# Generate test traffic +make monitor-traffic + +# View dashboards +make monitor-grafana +``` + +--- + +## Security Model + +### Critical Security Requirements + +⚠️ **IMPORTANT**: The monitoring stack includes several security considerations: + +1. **Grafana Admin Password** + - No default password allowed + - Must set `GRAFANA_ADMIN_PASSWORD` in `.env` + - Docker Compose will fail if not set + - Generate strong password: `openssl rand -base64 32` + +2. **Caddy Admin API** + - Exposes powerful administrative endpoints (`/load`, `/config`, `/stop`) + - **NO authentication** by default + - Production: Only accessible within Docker network; restrict further via firewalls/security groups when possible + - If you must expose it, configure Caddy's admin access controls (`admin.identity`, `admin.authorize`, or reverse-proxy ACLs) to require authentication + - Never expose to public internet + +3. 
**Service Exposure** + - Production: Services bound to `127.0.0.1` only + - Access via SSH tunneling from remote + - No direct internet exposure + +### Production Security Configuration + +**Docker Compose Production Services:** + +```yaml +grafana: + ports: + - "127.0.0.1:3000:3000" # Localhost only + +prometheus: + ports: + - "127.0.0.1:9090:9090" # Localhost only + +caddy_prod: + expose: + - "2019" # Internal network only - NOT exposed to host +``` + +**Remote Access:** + +```bash +# SSH tunnel for Grafana and Prometheus +ssh -L 3000:localhost:3000 -L 9090:localhost:9090 user@your-server + +# Access Caddy admin API (debugging only) +docker exec -it oullin_proxy_prod curl http://localhost:2019/metrics +``` + +### Security Checklist + +- ✅ `GRAFANA_ADMIN_PASSWORD` set with strong password +- ✅ Firewall configured (UFW) +- ✅ Only necessary ports exposed (22, 80, 443) +- ✅ Monitoring services NOT exposed to internet +- ✅ Docker secrets for sensitive data +- ✅ Regular backups scheduled +- ✅ Log rotation configured +- ✅ SSH key-based authentication + +--- + +## Grafana Dashboards + +### Accessing Dashboards + +**Local:** <http://localhost:3000> +**Production:** SSH tunnel then <http://localhost:3000> + +### Dashboard Files + +All dashboards are in `infra/metrics/grafana/dashboards/`: +- `oullin-overview-oullin-overview.json` +- `oullin-postgresql-postgresql-database-metrics.json` +- `oullin-caddy-caddy-proxy-metrics.json` + +### Exporting Dashboards + +Use the built-in export script: + +```bash +make monitor-export-dashboards +``` + +This will: +1. List all dashboards in Grafana +2. Let you select which to export +3. Save to `infra/metrics/grafana/dashboards/` +4. Format properly for provisioning + +### Manual Export + +1. Open your dashboard in Grafana +2. Click **"Share"** → **"Export"** tab +3. Click **"Save to file"** or **"View JSON"** +4. Save to `infra/metrics/grafana/dashboards/` +5. 
Restart Grafana: `make monitor-restart` + +### Updating Dashboards Safely + +To keep dashboard changes reproducible and under version control: + +1. **Start monitoring stack**: `make monitor-up` +2. **Make changes in Grafana UI**: Navigate to <http://localhost:3000> and edit dashboards +3. **Export your changes**: Run `./infra/metrics/grafana/scripts/export-dashboards.sh` + - Select specific dashboard or `all` to export all dashboards + - Exports are saved to `infra/metrics/grafana/dashboards/` +4. **Review the diff**: `git diff infra/metrics/grafana/dashboards/` +5. **Commit changes**: Add and commit the exported JSON files +6. **Verify**: `make monitor-restart` to ensure dashboards reload correctly + +**Warning:** Always export after making UI changes—manual edits to JSON files can work but are error-prone. + +--- + +## Creating Custom Dashboards + +### Method 1: Create in UI (Recommended) + +**Step 1:** Start Grafana + +```bash +make monitor-up +make monitor-grafana # Opens http://localhost:3000 +``` + +**Step 2:** Create dashboard + +1. Click **"+"** → **"Dashboard"** → **"Add visualization"** +2. Select **"Prometheus"** as data source +3. Write PromQL query +4. Choose visualization type (Time series, Stat, Gauge, Table) +5. Configure panel (title, description, units, thresholds) +6. Add more panels as needed +7. Save dashboard + +**Step 3:** Export + +```bash +make monitor-export-dashboards +``` + +### Method 2: Use Community Dashboards + +Grafana has thousands of pre-built dashboards at <https://grafana.com/grafana/dashboards/> + +**Popular for our stack:** +- [9628](https://grafana.com/grafana/dashboards/9628) - PostgreSQL Database +- [455](https://grafana.com/grafana/dashboards/455) - PostgreSQL Stats +- [10826](https://grafana.com/grafana/dashboards/10826) - Go Metrics +- [6671](https://grafana.com/grafana/dashboards/6671) - Go Processes + +**Import via UI:** +1. Click **"+"** → **"Import"** +2. Enter dashboard ID +3. Select **"Prometheus"** as data source +4. 
Click **"Import"** + +### Dashboard Best Practices + +**Organization:** +- One dashboard per service +- Overview dashboard for high-level metrics +- Detail dashboards for deep dives +- Use tags for categorization + +**Panel Design:** +- Clear titles +- Descriptions for complex metrics +- Consistent colors +- Appropriate units (bytes, %, req/s) +- Thresholds for warnings/errors + +**Query Performance:** +- Avoid high-cardinality labels +- Use recording rules for expensive queries +- Limit time range +- Use `rate()` instead of raw counters + +--- + +## Prometheus Queries + +### API Metrics + +```promql +# Request rate +rate(promhttp_metric_handler_requests_total[5m]) + +# Memory usage +go_memstats_alloc_bytes{job="api"} + +# Goroutines (check for leaks) +go_goroutines{job="api"} + +# GC duration +rate(go_gc_duration_seconds_sum[5m]) + +# Heap allocations +rate(go_memstats_alloc_bytes_total[5m]) +``` + +### PostgreSQL Metrics + +```promql +# Active connections +pg_stat_database_numbackends + +# Database size +pg_database_size_bytes + +# Transaction rate +rate(pg_stat_database_xact_commit[5m]) + +# Cache hit ratio (should be >90%) +rate(pg_stat_database_blks_hit[5m]) / +(rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m])) + +# Rows inserted/updated/deleted +rate(pg_stat_database_tup_inserted[5m]) +rate(pg_stat_database_tup_updated[5m]) +rate(pg_stat_database_tup_deleted[5m]) +``` + +### Caddy Metrics + +```promql +# Request rate by status +sum by(code) (rate(caddy_http_requests_total[5m])) + +# Response time percentiles +histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m])) +histogram_quantile(0.99, rate(caddy_http_request_duration_seconds_bucket[5m])) + +# Error rate +sum(rate(caddy_http_request_errors_total[5m])) + +# Response traffic rate +rate(caddy_http_response_size_bytes_sum[5m]) +``` + +--- + +## Troubleshooting + +### Dashboards Don't Load + +```bash +# Check JSON syntax +jq . 
< infra/metrics/grafana/dashboards/my-dashboard.json + +# Check Grafana logs +docker logs oullin_grafana_local # Local +docker logs oullin_grafana # Production + +# Or view all monitoring logs +make monitor-logs # Local +make monitor-logs-prod # Production + +# Verify Prometheus connection +# Grafana UI → Settings → Data Sources → Prometheus → "Save & Test" + +# Ensure Prometheus is running +docker ps | grep prometheus +``` + +### No Data in Panels + +```bash +# Verify Prometheus is scraping targets +make monitor-targets +# Or: curl http://localhost:9090/api/v1/targets + +# Test query in Prometheus +# Open http://localhost:9090 + +# Wait a few minutes for initial data collection +``` + +### Prometheus Not Scraping + +```bash +# Check network connectivity +docker exec -it oullin_prometheus_local ping caddy_local + +# Verify service exposes metrics +docker exec -it oullin_prometheus_local curl http://caddy_local:2019/metrics + +# Check Prometheus config +docker exec -it oullin_prometheus_local cat /etc/prometheus/prometheus.yml +``` + +### Targets Show as DOWN + +```bash +# Check container networking +docker network ls +docker network inspect caddy_net + +# Check container names match Prometheus config +docker ps + +# Restart services +make monitor-restart +# Or: docker compose --profile local restart +``` + +### High Memory Usage + +```bash +# Monitor memory +docker stats + +# If Prometheus using too much memory: +# - Reduce retention time +# - Decrease scrape frequency +# - Add metric filters +``` + +### Data Not Persisting + +```bash +# Ensure volumes are configured +docker volume ls +docker volume inspect prometheus_data_local # Local +docker volume inspect prometheus_data_prod # Production +docker volume inspect grafana_data_local # Local +docker volume inspect grafana_data_prod # Production +``` + +--- + +## Maintenance & Backup + +### Backing Up Data + +**Automated backup** (recommended): + +```bash +# Runs daily via cron, keeps last 5 backups +make 
monitor-backup # Local environment +make monitor-backup-prod # Production environment +``` + +Backups saved to: +- **Local**: `storage/monitoring/backups/prometheus-backup-YYYYMMDD-HHMMSS.tar.gz` +- **Production**: `storage/monitoring/backups/prometheus-prod-backup-YYYYMMDD-HHMMSS.tar.gz` + +**Manual backup:** + +```bash +# Backup Prometheus data +docker run --rm -v prometheus_data_local:/data -v $(pwd)/backups:/backup alpine \ + tar czf /backup/prometheus-backup-$(date +%Y%m%d-%H%M%S).tar.gz /data +# (Use prometheus_data_prod on production hosts) + +# Backup Grafana data +docker run --rm -v grafana_data_local:/data -v $(pwd)/backups:/backup alpine \ + tar czf /backup/grafana-backup-$(date +%Y%m%d-%H%M%S).tar.gz /data +# (Use grafana_data_prod on production hosts) +``` + +### Restoring from Backup + +```bash +# Stop services +make monitor-down + +# Restore Prometheus data +# WARNING: This will DELETE all existing Prometheus data. Validate backups and consider restoring in a test environment first. +docker run --rm -v prometheus_data_local:/data -v $(pwd)/backups:/backup alpine \ + sh -c "rm -rf /data/* && tar xzf /backup/prometheus-backup-YYYYMMDD-HHMMSS.tar.gz -C /" +# (Use prometheus_data_prod on production hosts) + +# Restore Grafana data +# WARNING: This will DELETE all existing Grafana data. Keep a secondary backup if unsure. 
+docker run --rm -v grafana_data_local:/data -v $(pwd)/backups:/backup alpine \ + sh -c "rm -rf /data/* && tar xzf /backup/grafana-backup-YYYYMMDD-HHMMSS.tar.gz -C /" +# (Use grafana_data_prod on production hosts) + +# Restart services +make monitor-up +``` + +### Updating the Stack + +**Local environment:** +```bash +# Pull latest images +docker compose pull + +# Restart with new images +make monitor-restart +# Or: docker compose --profile local up -d +``` + +**Production environment:** +```bash +# Pull latest images +docker compose pull + +# Restart with new images +make monitor-restart-prod +# Or: docker compose --profile prod up -d +``` + +### Monitoring Resource Usage + +```bash +# CPU and Memory usage +docker stats + +# Disk usage by container +docker system df -v + +# Container logs size +sudo du -sh /var/lib/docker/containers/*/*-json.log +``` + +### Cleaning Up Old Data + +Prometheus automatically handles retention based on `--storage.tsdb.retention.time` (30d prod, 7d local). + +Manual cleanup: + +```bash +# Stop Prometheus +docker compose stop prometheus_local + +# Clean data +docker run --rm -v prometheus_data_local:/data alpine rm -rf /data/* +# (Use prometheus_data_prod on production hosts) + +# Restart +docker compose --profile local up -d prometheus_local +``` + +--- + +## Resources + +### Official Documentation + +- [Prometheus Documentation](https://prometheus.io/docs/) +- [Grafana Documentation](https://grafana.com/docs/) +- [Grafana Dashboards](https://grafana.com/grafana/dashboards/) +- [Caddy Metrics](https://caddyserver.com/docs/metrics) +- [PostgreSQL Exporter](https://github.com/prometheus-community/postgres_exporter) +- [PromQL Basics](https://prometheus.io/docs/prometheus/latest/querying/basics/) +- [Grafonnet Library](https://github.com/grafana/grafonnet-lib) + +### Quick Reference Commands + +```bash +# Start monitoring stack +make monitor-up # Local +make monitor-up-prod # Production + +# Access services +make monitor-grafana # Open 
Grafana +make monitor-prometheus # Open Prometheus + +# Check status +make monitor-status # Service health +make monitor-targets # Prometheus targets + +# Generate traffic +make monitor-traffic # Local +make monitor-traffic-prod # Production + +# View logs +make monitor-logs # All services (local) +make monitor-logs-prod # All services (production) + +# Individual container logs +docker logs oullin_grafana_local # Grafana (local) +docker logs oullin_prometheus_local # Prometheus (local) +docker logs oullin_grafana # Grafana (production) +docker logs oullin_prometheus # Prometheus (production) + +# Maintenance +make monitor-backup # Backup Prometheus data +make monitor-restart # Restart services (local) +make monitor-restart-prod # Restart services (production) +make monitor-export-dashboards + +# Cleanup +make monitor-down # Stop services (local) +make monitor-down-prod # Stop services (production) +make monitor-clean # Clean up data (local) +make monitor-clean-prod # Clean up data (production) +``` + +### Production Deployment + +For complete VPS deployment instructions including firewall setup, SSL configuration, and production best practices, see [VPS_DEPLOYMENT.md](./VPS_DEPLOYMENT.md). + +--- + +## Next Steps + +1. **Set up Alerting**: Configure Prometheus Alertmanager for critical metrics +2. **Add Custom Metrics**: Instrument your API with custom business metrics +3. **Create Custom Dashboards**: Build dashboards specific to your use case +4. **Configure Recording Rules**: Pre-compute expensive queries +5. **Implement SLOs**: Define and monitor Service Level Objectives +6. **Export and Share**: Share dashboard configurations with your team + +--- + +For questions or issues, please check the [Troubleshooting](#troubleshooting) section or refer to the official documentation links above. 
diff --git a/infra/metrics/VPS_DEPLOYMENT.md b/infra/metrics/VPS_DEPLOYMENT.md new file mode 100644 index 00000000..11ca467d --- /dev/null +++ b/infra/metrics/VPS_DEPLOYMENT.md @@ -0,0 +1,436 @@ +# VPS Deployment Guide + +Complete guide for deploying the Oullin monitoring stack on an Ubuntu VPS (Hostinger or similar). + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Initial Server Setup](#initial-server-setup) +3. [Install Docker and Docker Compose](#install-docker-and-docker-compose) +4. [Install Make](#install-make) +5. [Clone Your Repository](#clone-your-repository) +6. [Configure Environment Variables](#configure-environment-variables) +7. [Set Up Docker Secrets](#set-up-docker-secrets) +8. [Configure Firewall](#configure-firewall) +9. [Deploy the Monitoring Stack](#deploy-the-monitoring-stack) +10. [Verify Monitoring Stack](#verify-monitoring-stack) +11. [Access Grafana Remotely](#access-grafana-remotely) +12. [Production Considerations](#production-considerations) +13. [Generate Test Traffic](#generate-test-traffic) +14. [VPS Troubleshooting](#vps-troubleshooting) +15. [Updating the Stack](#updating-the-stack) +16. 
[Installing Fail2ban](#installing-fail2ban) + +--- + +## Prerequisites + +- Hostinger VPS with Ubuntu 20.04 or 22.04 (or similar VPS provider) +- SSH access to your VPS +- Domain name (optional, but recommended for SSL) +- At least 2GB RAM and 20GB storage + +--- + +## Initial Server Setup + +Connect to your VPS: + +```bash +ssh root@your-vps-ip +``` + +Update the system: + +```bash +apt update && apt upgrade -y +``` + +Create a non-root user: + +```bash +# Create user +adduser deployer + +# Add to sudo group +usermod -aG sudo deployer + +# Switch to new user +su - deployer +``` + +--- + +## Install Docker and Docker Compose + +Install required packages: + +```bash +sudo apt install -y apt-transport-https ca-certificates curl software-properties-common +``` + +Add Docker's official GPG key: + +```bash +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg +``` + +Add Docker repository: + +```bash +echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +``` + +Install Docker: + +```bash +sudo apt update +sudo apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin +``` + +Add your user to the docker group: + +```bash +sudo usermod -aG docker ${USER} +``` + +Log out and back in, then verify: + +```bash +docker --version +docker compose version +``` + +--- + +## Install Make + +```bash +sudo apt install -y make +``` + +--- + +## Clone Your Repository + +```bash +cd ~ +git clone https://github.com/yourusername/your-repo.git +cd your-repo +``` + +--- + +## Configure Environment Variables + +Create your `.env` file with production settings: + +```bash +cat > .env << 'EOF' +# Database Configuration +POSTGRES_USER=your_db_user +POSTGRES_PASSWORD=your_strong_db_password +POSTGRES_DB=your_database_name + +# 
Grafana Configuration (REQUIRED - no default) +GRAFANA_ADMIN_PASSWORD=your_very_strong_grafana_password + +# Production Domain (optional, for SSL) +DOMAIN=your-domain.com + +# Environment +ENVIRONMENT=production +EOF +``` + +**Security Notes:** +- Use strong, unique passwords +- Never commit `.env` to version control +- Consider using a password manager + +--- + +## Set Up Docker Secrets + +Avoid piping credentials through `echo` because the literal values end up in your shell history. Use one of the safer patterns below. + +### Option 1: Read secrets from secure input + +```bash +# Prompt won't echo characters and won't touch shell history +read -s -p "Enter database password: " DB_PASSWORD && echo + +echo "$DB_PASSWORD" | docker secret create pg_password - 2>/dev/null || \ + printf "%s" "$DB_PASSWORD" > secrets/pg_password + +unset DB_PASSWORD +``` + +Repeat the same pattern for usernames or other sensitive values you do not want stored on disk. + +### Option 2: Write files directly + +```bash +mkdir -p secrets +printf "your_db_user" > secrets/pg_username +printf "your_strong_db_password" > secrets/pg_password +printf "your_database_name" > secrets/pg_dbname +chmod 600 secrets/* +``` + +Store these files somewhere secure (e.g., `pass`, `1Password CLI`, `sops`) and only copy them onto the server when needed. + +--- + +## Configure Firewall + +Set up UFW: + +```bash +# Allow SSH (IMPORTANT: do this BEFORE enabling UFW, or you can lock yourself out) 
+sudo ufw allow 22/tcp + +# Allow HTTP and HTTPS (for Caddy) +sudo ufw allow 80/tcp +sudo ufw allow 443/tcp + +# Enable UFW (only after the SSH rule above is in place) +sudo ufw --force enable + +# Verify rules +sudo ufw status +``` + +**Do NOT expose Prometheus (9090), Grafana (3000), or postgres_exporter (9187) ports!** + +--- + +## Deploy the Monitoring Stack + +```bash +# Start with production profile +make monitor-up-prod +# Or: docker compose --profile prod up -d +``` + +Verify services: + +```bash +docker compose ps +``` + +Expected containers: +- `oullin_prometheus` +- `oullin_grafana` +- `oullin_postgres_exporter` +- `oullin_proxy_prod` +- `oullin_db` + +--- + +## Verify Monitoring Stack + +Check Prometheus targets: + +```bash +curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}' +``` + +All should show `"health": "up"`. + +--- + +## Access Grafana Remotely + +From your local machine: + +```bash +ssh -L 3000:localhost:3000 deployer@your-vps-ip +``` + +Then open `http://localhost:3000` in your browser. + +**Login:** +- Username: `admin` +- Password: Value from `GRAFANA_ADMIN_PASSWORD` + +--- + +## Production Considerations + +### Enable Automatic Backups + +Schedule daily backups: + +```bash +crontab -e +``` + +Add: + +```cron +# NOTE: Update /home/deployer/your-repo to your actual repository path +# Run daily at 2 AM +0 2 * * * cd /home/deployer/your-repo && make monitor-backup-prod >> /var/log/prometheus-backup.log 2>&1 +``` + +### Monitor Disk Space + +```bash +# Check disk usage +df -h + +# Check Prometheus data size +docker exec oullin_prometheus du -sh /prometheus +``` + +### Configure Log Rotation + +```bash +sudo tee /etc/docker/daemon.json > /dev/null << 'EOF' +{ + "log-driver": "json-file", + "log-opts": { + "max-size": "10m", + "max-file": "3" + } +} +EOF + +sudo systemctl restart docker +make monitor-restart-prod +``` + +### Enable SSL/TLS (Optional) + +If you have a domain, configure Caddy for automatic HTTPS. 
+ +Edit `infra/caddy/Caddyfile.prod`: + +```caddyfile +your-domain.com { + reverse_proxy api:8080 + + log { + output file /var/log/caddy/access.log + } +} + +# Admin API (internal only) +127.0.0.1:2019 { + admin { + metrics + } +} +``` + +Caddy will automatically obtain Let's Encrypt certificates. + +--- + +## Generate Test Traffic + +```bash +make monitor-traffic-prod +``` + +Wait a few minutes for data to appear in Grafana. + +--- + +## VPS Troubleshooting + +### Services won't start + +```bash +# View logs from monitoring services +make monitor-logs # Local: all services +make monitor-logs-prod # Production: all services + +# Or view individual container logs +docker logs oullin_grafana +docker logs oullin_prometheus + +# Check Docker daemon +sudo systemctl status docker +``` + +### Can't connect via SSH tunnel + +```bash +# Verify Grafana is listening +docker exec oullin_grafana netstat -tlnp | grep 3000 + +# Check if port is already in use locally +lsof -i :3000 +``` + +### Prometheus targets are down + +```bash +# Check DNS resolution +docker exec oullin_prometheus nslookup oullin_proxy_prod +docker exec oullin_prometheus nslookup oullin_postgres_exporter + +# Verify network +docker network inspect caddy_net oullin_net +``` + +### Out of disk space + +```bash +# Clean up Docker +# WARNING: removes all unused images, stopped containers, networks, and unused volumes. +# Keep the monitoring stack running (or back it up first) so its data volumes stay in use. +docker system prune -a --volumes + +# Rotate backups (keeps last 5) +make monitor-backup-prod + +# Clear old Prometheus data +# WARNING: this deletes the write-ahead log; recent samples not yet written to blocks are lost. +docker exec oullin_prometheus rm -rf /prometheus/wal/* +``` + +--- + +## Updating the Stack + +```bash +cd ~/your-repo +git pull origin main + +make monitor-down-prod +make monitor-up-prod +``` + +--- + +## Installing Fail2ban + +```bash +sudo apt install -y fail2ban +sudo systemctl start fail2ban +sudo systemctl enable fail2ban +sudo fail2ban-client status sshd +``` + +--- + +## Production Checklist + +- ✅ `GRAFANA_ADMIN_PASSWORD` set in `.env` +- ✅ Firewall configured (UFW) +- ✅ Services bound to localhost +- ✅ SSH tunneling configured +- ✅ Backups scheduled 
(cron) +- ✅ Log rotation configured +- ✅ SSL/TLS enabled (if domain) +- ✅ Fail2ban installed +- ✅ All Prometheus targets UP +- ✅ Dashboards accessible +- ✅ Retention policies set +- ✅ Volumes backed up regularly + +--- + +## Additional Resources + +For monitoring-specific documentation, see [README.md](./README.md). diff --git a/infra/metrics/grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json b/infra/metrics/grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json new file mode 100644 index 00000000..47c068c4 --- /dev/null +++ b/infra/metrics/grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json @@ -0,0 +1,482 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "sum(rate(caddy_http_request_duration_seconds_count[5m]))", + "legendFormat": "Requests/s", + "refId": "A" + } + ], + "title": "Total Request Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + 
"colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Response Time (p95)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "sum by(code) (rate(caddy_http_request_duration_seconds_count[5m]))", + "legendFormat": "{{code}}", + "refId": "A" + } + ], + "title": "Requests by Status Code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(caddy_http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, rate(caddy_http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Request Duration Percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": 
false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "rate(caddy_http_response_size_bytes_sum[5m])", + "legendFormat": "Response", + "refId": "A" + } + ], + "title": "Response Traffic Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "sum(rate(caddy_http_request_errors_total[5m])) or vector(0)", + "legendFormat": "Errors/s", + "refId": "A" + } + ], + "title": "Request Errors", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": ["oullin", "caddy", "proxy"], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Caddy - Proxy Metrics", + "uid": "oullin-caddy", + "version": 1 +} diff --git a/infra/metrics/grafana/dashboards/oullin-overview-oullin-overview.json b/infra/metrics/grafana/dashboards/oullin-overview-oullin-overview.json new file mode 100644 index 00000000..1a2e4d5e --- /dev/null +++ b/infra/metrics/grafana/dashboards/oullin-overview-oullin-overview.json @@ -0,0 +1,395 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "sum(rate(caddy_http_request_duration_seconds_count[5m]))", + "legendFormat": "Caddy Requests/s", + "refId": "A" + } + ], + "title": "Caddy Request Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "pg_stat_database_numbackends{datname=~\".*\"}", + "legendFormat": "DB Connections", + "refId": "A" + } + ], + "title": "PostgreSQL Active Connections", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.4.0", + "targets": [ + { + "expr": "sum by(code) (rate(caddy_http_request_duration_seconds_count[5m]))", + "legendFormat": "{{code}}", + "refId": "A" + } + ], + "title": "HTTP Requests by Status Code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=\"api\"}", + "legendFormat": "API Memory Usage", + "refId": "A" + } + ], + "title": "API Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + 
"gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "go_goroutines{job=\"api\"}", + "legendFormat": "Goroutines", + "refId": "A" + } + ], + "title": "API Goroutines", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": ["oullin", "overview"], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Oullin - Overview", + "uid": "oullin-overview", + "version": 1 +} diff --git a/infra/metrics/grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json b/infra/metrics/grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json new file mode 100644 index 00000000..abfc3662 --- /dev/null +++ b/infra/metrics/grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json @@ -0,0 +1,600 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "pg_stat_database_numbackends", + "legendFormat": "{{datname}}", + "refId": "A" + } + ], + "title": "Active Connections", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "pg_database_size_bytes", + "legendFormat": "{{datname}}", + "refId": "A" + } + ], + "title": "Database Size", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + 
"values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "rate(pg_stat_database_xact_commit[5m])", + "legendFormat": "Commits/s", + "refId": "A" + } + ], + "title": "Transaction Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "pg_stat_database_conflicts", + "legendFormat": "Conflicts", + "refId": "A" + } + ], + "title": "Conflicts", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": 
{ + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "rate(pg_stat_database_tup_inserted[5m])", + "legendFormat": "Inserts - {{datname}}", + "refId": "A" + }, + { + "expr": "rate(pg_stat_database_tup_updated[5m])", + "legendFormat": "Updates - {{datname}}", + "refId": "B" + }, + { + "expr": "rate(pg_stat_database_tup_deleted[5m])", + "legendFormat": "Deletes - {{datname}}", + "refId": "C" + } + ], + "title": "Database Operations", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "pg_stat_database_numbackends", + "legendFormat": "{{datname}}", + "refId": 
"A" + } + ], + "title": "Active Connections Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "rate(pg_stat_database_blks_hit[5m]) / (rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m]))", + "legendFormat": "{{datname}}", + "refId": "A" + } + ], + "title": "Cache Hit Ratio", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + 
}, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "pg_locks_count", + "legendFormat": "{{mode}} - {{datname}}", + "refId": "A" + } + ], + "title": "Database Locks", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": ["oullin", "postgresql", "database"], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "PostgreSQL - Database Metrics", + "uid": "oullin-postgresql", + "version": 1 +} diff --git a/infra/metrics/grafana/provisioning/dashboards/default.yml b/infra/metrics/grafana/provisioning/dashboards/default.yml new file mode 100644 index 00000000..45fb2660 --- /dev/null +++ b/infra/metrics/grafana/provisioning/dashboards/default.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'Oullin Dashboards' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true diff --git a/infra/metrics/grafana/provisioning/datasources/prometheus.yml b/infra/metrics/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..c9be740e --- /dev/null +++ b/infra/metrics/grafana/provisioning/datasources/prometheus.yml @@ -0,0 
+1,14 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    uid: prometheus
+    type: prometheus
+    access: proxy
+    url: ${GF_DATASOURCE_PROMETHEUS_URL}
+    isDefault: true
+    editable: true
+    allowUiUpdates: true
+    jsonData:
+      timeInterval: 15s
+      queryTimeout: 60s
diff --git a/infra/metrics/grafana/scripts/export-dashboards.sh b/infra/metrics/grafana/scripts/export-dashboards.sh
new file mode 100755
index 00000000..43e53a28
--- /dev/null
+++ b/infra/metrics/grafana/scripts/export-dashboards.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# Interactive helper: exports Grafana dashboards as JSON files into OUTPUT_DIR (requires curl and jq)
+
+set -e
+
+GRAFANA_URL="${GRAFANA_URL:-http://localhost:3000}"
+GRAFANA_USER="${GRAFANA_USER:-admin}"
+GRAFANA_PASSWORD="${GRAFANA_PASSWORD:-admin}"
+OUTPUT_DIR="./infra/metrics/grafana/dashboards"
+
+echo "================================"
+echo "Grafana Dashboard Export Tool"
+echo "================================"
+echo ""
+
+# Check if Grafana is running
+if ! curl -s "$GRAFANA_URL/api/health" > /dev/null 2>&1; then
+    echo "Error: Grafana is not accessible at $GRAFANA_URL"
+    echo "Please start Grafana with: make monitor-up"
+    exit 1
+fi
+
+# List all dashboards as "<uid> <title>" lines (one dashboard per line)
+echo "Fetching dashboard list..."
+DASHBOARDS=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
+    "$GRAFANA_URL/api/search?type=dash-db" | jq -r '.[] | "\(.uid) \(.title)"')
+
+if [ -z "$DASHBOARDS" ]; then
+    echo "No dashboards found in Grafana"
+    exit 0
+fi
+
+echo ""
+echo "Available dashboards:"
+echo "---------------------"
+echo "$DASHBOARDS" | nl
+echo ""
+
+# Ask user which dashboard to export (the number shown by nl above, or 'all')
+read -r -p "Enter dashboard number to export (or 'all' for all dashboards): " SELECTION
+
+# Validate selection
+if [ "$SELECTION" != "all" ]; then
+    # Check if selection is a valid number
+    if ! [[ "$SELECTION" =~ ^[0-9]+$ ]]; then
+        echo "Error: Please enter a valid number or 'all'"
+        exit 1
+    fi
+
+    # Check if selection is within valid range (1..number of listed dashboards)
+    DASHBOARD_COUNT=$(echo "$DASHBOARDS" | wc -l)
+    if [ "$SELECTION" -lt 1 ] || [ "$SELECTION" -gt "$DASHBOARD_COUNT" ]; then
+        echo "Error: Selection out of range (1-$DASHBOARD_COUNT)"
+        exit 1
+    fi
+fi
+
+if [ "$SELECTION" = "all" ]; then
+    # Export all dashboards. NOTE(review): output keeps the API envelope (.dashboard/.overwrite) - confirm provisioning accepts it
+    echo ""
+    echo "Exporting all dashboards..."
+
+    EXPORT_COUNT=0
+    FAIL_COUNT=0
+
+    while IFS= read -r line; do
+        DASH_UID=$(echo "$line" | awk '{print $1}')  # not UID: bash defines UID as a readonly variable
+        TITLE=$(echo "$line" | cut -d' ' -f2-)
+        FILENAME="${DASH_UID}-$(echo "$TITLE" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd '[:alnum:]-').json"
+
+        echo -n "Exporting: $TITLE -> $FILENAME ... "
+
+        # Temporarily disable errexit: ((EXPORT_COUNT++)) returns non-zero while the counter is still 0
+        set +e
+        if curl -sf -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
+            "$GRAFANA_URL/api/dashboards/uid/$DASH_UID" | \
+            jq 'del(.meta) | .dashboard.id = null | .overwrite = true' > \
+            "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+
+            # Verify the file is valid JSON and not empty (curl -f leaves it empty on an HTTP error)
+            if [ -s "$OUTPUT_DIR/$FILENAME" ] && jq empty "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+                echo "✓ Success"
+                ((EXPORT_COUNT++))
+            else
+                echo "✗ Failed (invalid JSON)"
+                rm -f "$OUTPUT_DIR/$FILENAME"
+                ((FAIL_COUNT++))
+            fi
+        else
+            echo "✗ Failed (export error)"
+            rm -f "$OUTPUT_DIR/$FILENAME"
+            ((FAIL_COUNT++))
+        fi
+        set -e
+    done <<< "$DASHBOARDS"
+
+    echo ""
+    echo "Export summary: $EXPORT_COUNT succeeded, $FAIL_COUNT failed"
+
+    if [ $FAIL_COUNT -gt 0 ]; then
+        exit 1
+    fi
+
+else
+    # Export single dashboard: pick the Nth "<uid> <title>" line from the list
+    SELECTED_LINE=$(echo "$DASHBOARDS" | sed -n "${SELECTION}p")
+
+    if [ -z "$SELECTED_LINE" ]; then
+        echo "Error: Invalid selection"
+        exit 1
+    fi
+
+    DASH_UID=$(echo "$SELECTED_LINE" | awk '{print $1}')  # not UID: bash defines UID as a readonly variable
+    TITLE=$(echo "$SELECTED_LINE" | cut -d' ' -f2-)
+    FILENAME="${DASH_UID}-$(echo "$TITLE" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd '[:alnum:]-').json"
+
+    echo ""
+    echo "Exporting: $TITLE"
+
+    # Temporarily disable errexit so a failed export is reported below instead of aborting silently
+    set +e
+    if curl -sf -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
+        "$GRAFANA_URL/api/dashboards/uid/$DASH_UID" | \
+        jq 'del(.meta) | .dashboard.id = null | .overwrite = true' > \
+        "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+
+        # Verify the file is valid JSON and not empty (curl -f leaves it empty on an HTTP error)
+        if [ -s "$OUTPUT_DIR/$FILENAME" ] && jq empty "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+            echo "✓ Saved to: $OUTPUT_DIR/$FILENAME"
+        else
+            echo "✗ Error: Export produced invalid JSON"
+            rm -f "$OUTPUT_DIR/$FILENAME"
+            exit 1
+        fi
+    else
+        echo "✗ Error: Failed to export dashboard"
+        rm -f "$OUTPUT_DIR/$FILENAME"
+        exit 1
+    fi
+    set -e
+fi
+
+echo ""
+echo "Export complete!"
+echo ""
+echo "To reload dashboards:"
+echo " make monitor-restart"
diff --git a/infra/metrics/prometheus/provisioning/prometheus.local.yml b/infra/metrics/prometheus/provisioning/prometheus.local.yml
new file mode 100644
index 00000000..4c661cbb
--- /dev/null
+++ b/infra/metrics/prometheus/provisioning/prometheus.local.yml
@@ -0,0 +1,41 @@
+# Prometheus configuration for local development/testing
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  external_labels:
+    monitor: 'oullin-local'
+    environment: 'local'
+
+scrape_configs:
+  # Caddy metrics endpoint (dedicated /metrics endpoint, not admin API)
+  - job_name: 'caddy'
+    static_configs:
+      - targets: ['caddy_local:9180']
+        labels:
+          service: 'caddy'
+          environment: 'local'
+
+  # PostgreSQL database metrics via postgres_exporter (local)
+  - job_name: 'postgresql'
+    static_configs:
+      - targets: ['oullin_postgres_exporter_local:9187']
+        labels:
+          service: 'postgresql'
+          environment: 'local'
+
+  # API metrics endpoint (local)
+  - job_name: 'api'
+    metrics_path: '/metrics'
+    static_configs:
+      - targets: ['api:8080']
+        labels:
+          service: 'api'
+          environment: 'local'
+
+  # Prometheus self-monitoring
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+        labels:
+          service: 'prometheus'
+
environment: 'local' diff --git a/infra/metrics/prometheus/provisioning/prometheus.yml b/infra/metrics/prometheus/provisioning/prometheus.yml new file mode 100644 index 00000000..18ef3a2c --- /dev/null +++ b/infra/metrics/prometheus/provisioning/prometheus.yml @@ -0,0 +1,41 @@ +# Prometheus configuration for monitoring Caddy, API, and PostgreSQL +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: 'oullin-prod' + environment: 'production' + +scrape_configs: + # Caddy metrics endpoint (dedicated /metrics endpoint, not admin API) + - job_name: 'caddy' + static_configs: + - targets: ['caddy_prod:9180'] + labels: + service: 'caddy' + environment: 'production' + + # PostgreSQL database metrics via postgres_exporter + - job_name: 'postgresql' + static_configs: + - targets: ['oullin_postgres_exporter:9187'] + labels: + service: 'postgresql' + environment: 'production' + + # API metrics endpoint + - job_name: 'api' + metrics_path: '/metrics' + static_configs: + - targets: ['api:8080'] + labels: + service: 'api' + environment: 'production' + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + labels: + service: 'prometheus' + environment: 'production' diff --git a/infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh b/infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh new file mode 100755 index 00000000..55f48fce --- /dev/null +++ b/infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh @@ -0,0 +1,20 @@ +#!/bin/sh +set -e + +# URL-encode function using od and tr (POSIX-compliant); percent-encodes every byte of $1 as %xx, reserved or not +# Required for credentials containing special characters (@, :, /, ?, =) +urlencode() { + string="$1" + printf '%s' "$string" | od -An -tx1 | tr ' ' % | tr -d '\n' +} + +# Read Docker secrets separately for better error diagnostics +PG_USER=$(cat /run/secrets/pg_username) +PG_PASSWORD=$(cat /run/secrets/pg_password) +PG_DBNAME=$(cat /run/secrets/pg_dbname) + +# 
Construct DATA_SOURCE_NAME with URL-encoded credentials +export DATA_SOURCE_NAME="postgresql://$(urlencode "$PG_USER"):$(urlencode "$PG_PASSWORD")@api-db:5432/$(urlencode "$PG_DBNAME")?sslmode=require" + +# Execute postgres_exporter with any additional arguments +exec /bin/postgres_exporter "$@" diff --git a/metal/kernel/app.go b/metal/kernel/app.go index f4066948..c4f10fb4 100644 --- a/metal/kernel/app.go +++ b/metal/kernel/app.go @@ -87,6 +87,7 @@ func (a *App) Boot() { modem.KeepAlive() modem.KeepAliveDB() + modem.Metrics() modem.Profile() modem.Experience() modem.Projects() diff --git a/metal/router/router.go b/metal/router/router.go index 0c68015b..02dab599 100644 --- a/metal/router/router.go +++ b/metal/router/router.go @@ -92,6 +92,17 @@ func (r *Router) KeepAliveDB() { r.Mux.HandleFunc("GET /ping-db", apiHandler) } +func (r *Router) Metrics() { + metricsHandler := handler.NewMetricsHandler() + + // Metrics endpoint blocked from public access by Caddy (see @protected matcher in Caddyfile) + // Only accessible internally via direct container access (api:8080/metrics) + // Prometheus scrapes via internal DNS without going through Caddy's public listener. NOTE(review): the value returned by Handle is discarded below - confirm the handler reports its own failures + r.Mux.HandleFunc("GET /metrics", func(w http.ResponseWriter, req *http.Request) { + _ = metricsHandler.Handle(w, req) + }) +} + func (r *Router) Profile() { maker := handler.NewProfileHandler diff --git a/storage/monitoring/backups/.gitkeep b/storage/monitoring/backups/.gitkeep new file mode 100644 index 00000000..5aab5f49 --- /dev/null +++ b/storage/monitoring/backups/.gitkeep @@ -0,0 +1 @@ +# Prometheus backups stored here