diff --git a/.env.example b/.env.example
index f798bd7f..5fd724fc 100644
--- a/.env.example
+++ b/.env.example
@@ -34,5 +34,13 @@ ENV_DOCKER_USER_GROUP="ggroup"
ENV_PING_USERNAME=
ENV_PING_PASSWORD=
+# --- HTTP Server
+ENV_HTTP_PORT=8080
+
# --- SEO: SPA application directory
ENV_SPA_DIR=
+ENV_SPA_IMAGES_DIR=
+
+# --- Monitoring: Grafana admin password
+# REQUIRED for Grafana dashboard access
+GRAFANA_ADMIN_PASSWORD=
diff --git a/.gitignore b/.gitignore
index 17ac5765..c1d9c088 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,8 +10,8 @@ storage/seo/*.*
!storage/seo/.gitkeep
# --- [Caddy]: mtls
-caddy/mtls/*.*
-!caddy/mtls/.gitkeep
+infra/caddy/mtls/*.*
+!infra/caddy/mtls/.gitkeep
# --- [API]: Bin
bin/*
diff --git a/Makefile b/Makefile
index 70b54ce6..694ddc59 100644
--- a/Makefile
+++ b/Makefile
@@ -34,14 +34,15 @@ CGO_ENABLED := 1
# -------------------------------------------------------------------------------------------------------------------- #
# -------------------------------------------------------------------------------------------------------------------- #
-include ./metal/makefile/helpers.mk
-include ./metal/makefile/env.mk
-include ./metal/makefile/db.mk
-include ./metal/makefile/app.mk
-include ./metal/makefile/logs.mk
-include ./metal/makefile/build.mk
-include ./metal/makefile/infra.mk
-include ./metal/makefile/caddy.mk
+include ./infra/makefile/helpers.mk
+include ./infra/makefile/env.mk
+include ./infra/makefile/db.mk
+include ./infra/makefile/app.mk
+include ./infra/makefile/logs.mk
+include ./infra/makefile/build.mk
+include ./infra/makefile/infra.mk
+include ./infra/makefile/caddy.mk
+include ./infra/makefile/monitor.mk
# -------------------------------------------------------------------------------------------------------------------- #
# -------------------------------------------------------------------------------------------------------------------- #
@@ -104,6 +105,14 @@ help:
@printf "$(BOLD)$(BLUE)Caddy Commands:$(NC)\n"
@printf " $(BOLD)$(GREEN)caddy-gen-cert$(NC) : Generate the caddy's mtls certificates.\n"
@printf " $(BOLD)$(GREEN)caddy-del-cert$(NC) : Remove the caddy's mtls certificates.\n"
- @printf " $(BOLD)$(GREEN)caddy-validate$(NC) : Validates caddy's files syntax.\n"
+ @printf " $(BOLD)$(GREEN)caddy-validate$(NC) : Validates caddy's files syntax.\n\n"
+
+ @printf "$(BOLD)$(BLUE)Monitoring Commands:$(NC)\n"
+ @printf " $(BOLD)$(GREEN)monitor-up$(NC) : Start the monitoring stack (Prometheus, Grafana).\n"
+ @printf " $(BOLD)$(GREEN)monitor-down$(NC) : Stop the monitoring stack.\n"
+ @printf " $(BOLD)$(GREEN)monitor-status$(NC) : Show status of monitoring services.\n"
+ @printf " $(BOLD)$(GREEN)monitor-test$(NC) : Run monitoring stack test suite.\n"
+ @printf " $(BOLD)$(GREEN)monitor-grafana$(NC) : Open Grafana dashboards in browser.\n"
+ @printf " $(BOLD)$(GREEN)monitor-help$(NC) : Show detailed monitoring commands.\n"
@printf "$(NC)\n"
diff --git a/docker-compose.yml b/docker-compose.yml
index 513b806a..a23ffc80 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,6 +12,14 @@ volumes:
caddy_config:
go_mod_cache:
driver: local
+ prometheus_data_prod:
+ driver: local
+ prometheus_data_local:
+ driver: local
+ grafana_data_prod:
+ driver: local
+ grafana_data_local:
+ driver: local
# --- DB: Define a named volume at the top level.
# Docker will manage its lifecycle.
@@ -30,7 +38,7 @@ services:
caddy_prod:
image: api-caddy_prod
build:
- context: ./caddy
+ context: ./infra/caddy
dockerfile: Dockerfile
args:
- CADDY_VERSION=2.10.2
@@ -40,16 +48,27 @@ services:
restart: unless-stopped
depends_on:
- api
+
+ # --- The 443:443/udp is required for HTTP/3
+ # NOTES:
+ # - Admin API (2019) listens only on localhost inside the container and is NOT published to host
+ # - Prometheus scrapes metrics from dedicated endpoint (9180) via Docker internal DNS
ports:
- "80:80"
- "443:443"
- "443:443/udp" # Required for HTTP/3
+ # NOTE: Admin API (2019) is NOT published to host (bound to localhost inside the container)
+ # Prometheus scrapes Caddy metrics from :9180 via Docker internal DNS
+
+ # --- Dedicated /metrics endpoint for Prometheus (internal network only)
+ expose:
+ - "9180"
volumes:
- caddy_data:/data
- caddy_config:/config
- - ./caddy/Caddyfile.prod:/etc/caddy/Caddyfile
+ - ./infra/caddy/Caddyfile.prod:/etc/caddy/Caddyfile
- ${CADDY_LOGS_PATH}:/var/log/caddy
- - ./caddy/mtls:/etc/caddy/mtls:ro
+ - ./infra/caddy/mtls:/etc/caddy/mtls:ro
networks:
caddy_net:
aliases:
@@ -57,7 +76,7 @@ services:
caddy_local:
build:
- context: ./caddy
+ context: ./infra/caddy
dockerfile: Dockerfile
args:
- CADDY_VERSION=2.10.2
@@ -68,15 +87,247 @@ services:
depends_on:
- api
ports:
- - "8080:80"
+ - "18080:80"
- "8443:443"
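+ - "127.0.0.1:2019:2019" # Admin API - published on host loopback only, for debugging (NOTE: Caddyfile.local binds admin to 127.0.0.1 inside the container, so verify this mapping is actually reachable)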
+
+ # --- Dedicated /metrics endpoint for Prometheus (internal network only)
+ expose:
+ - "9180"
+
volumes:
- caddy_data:/data
- caddy_config:/config
- - ./caddy/mtls:/etc/caddy/mtls:ro
- - ./caddy/Caddyfile.local:/etc/caddy/Caddyfile
+ - ./infra/caddy/mtls:/etc/caddy/mtls:ro
+ - ./infra/caddy/Caddyfile.local:/etc/caddy/Caddyfile
+ networks:
+ - caddy_net
+
+ prometheus:
+ image: prom/prometheus:v3.0.1
+ profiles: ["prod"]
+ container_name: oullin_prometheus
+ restart: unless-stopped
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--storage.tsdb.path=/prometheus'
+ - '--storage.tsdb.retention.time=30d'
+ - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+ - '--web.console.templates=/usr/share/prometheus/consoles'
+ ports:
+ - "127.0.0.1:9090:9090"
+ volumes:
+ - ./infra/metrics/prometheus/provisioning/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+ - prometheus_data_prod:/prometheus
+ networks:
+ - caddy_net
+ - oullin_net
+ depends_on:
+ caddy_prod:
+ condition: service_started
+ postgres_exporter:
+ condition: service_healthy
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ deploy:
+ resources:
+ limits:
+ cpus: '1.0'
+ memory: 1G
+ reservations:
+ cpus: '0.25'
+ memory: 256M
+
+ prometheus_local:
+ image: prom/prometheus:v3.0.1
+ profiles: ["local"]
+ container_name: oullin_prometheus_local
+ restart: unless-stopped
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--storage.tsdb.path=/prometheus'
+ - '--storage.tsdb.retention.time=7d'
+ - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+ - '--web.console.templates=/usr/share/prometheus/consoles'
+ ports:
+ - "9090:9090"
+ volumes:
+ - ./infra/metrics/prometheus/provisioning/prometheus.local.yml:/etc/prometheus/prometheus.yml:ro
+ - prometheus_data_local:/prometheus
+ networks:
+ - caddy_net
+ - oullin_net
+ depends_on:
+ caddy_local:
+ condition: service_started
+ postgres_exporter_local:
+ condition: service_healthy
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ deploy:
+ resources:
+ limits:
+ cpus: '1.0'
+ memory: 1G
+ reservations:
+ cpus: '0.25'
+ memory: 256M
+
+ postgres_exporter:
+ image: prometheuscommunity/postgres-exporter:v0.15.0
+ profiles: ["prod"]
+ container_name: oullin_postgres_exporter
+ restart: unless-stopped
+ entrypoint: ["/postgres-exporter-entrypoint.sh"]
+ volumes:
+ - ./infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh:/postgres-exporter-entrypoint.sh:ro
+ secrets:
+ - pg_username
+ - pg_password
+ - pg_dbname
+ networks:
+ - oullin_net
+ - caddy_net
+ depends_on:
+ api-db:
+ condition: service_healthy
+ expose:
+ - "9187"
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9187/"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ deploy:
+ resources:
+ limits:
+ cpus: '0.25'
+ memory: 128M
+ reservations:
+ cpus: '0.05'
+ memory: 32M
+
+ postgres_exporter_local:
+ image: prometheuscommunity/postgres-exporter:v0.15.0
+ profiles: ["local"]
+ container_name: oullin_postgres_exporter_local
+ restart: unless-stopped
+ entrypoint: ["/postgres-exporter-entrypoint.sh"]
+ volumes:
+ - ./infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh:/postgres-exporter-entrypoint.sh:ro
+ secrets:
+ - pg_username
+ - pg_password
+ - pg_dbname
+ networks:
+ - oullin_net
+ - caddy_net
+ depends_on:
+ api-db:
+ condition: service_healthy
+ expose:
+ - "9187"
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9187/"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ deploy:
+ resources:
+ limits:
+ cpus: '0.25'
+ memory: 128M
+ reservations:
+ cpus: '0.05'
+ memory: 32M
+
+ grafana:
+ image: grafana/grafana:11.4.0
+ profiles: ["prod"]
+ container_name: oullin_grafana
+ restart: unless-stopped
+ ports:
+ - "127.0.0.1:3000:3000"
+ environment:
+ - GF_SERVER_ROOT_URL=http://localhost:3000
+ - GF_SECURITY_ADMIN_USER=admin
+ - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set in .env file}
+ - GF_USERS_ALLOW_SIGN_UP=false
+ - GF_AUTH_ANONYMOUS_ENABLED=false
+ - GF_INSTALL_PLUGINS=
+ - GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus:9090
+ volumes:
+ - grafana_data_prod:/var/lib/grafana
+ - ./infra/metrics/grafana/provisioning:/etc/grafana/provisioning:ro
+ - ./infra/metrics/grafana/dashboards:/var/lib/grafana/dashboards:ro
+ networks:
+ - caddy_net
+ depends_on:
+ prometheus:
+ condition: service_healthy
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 30s
+ deploy:
+ resources:
+ limits:
+ cpus: '0.5'
+ memory: 512M
+ reservations:
+ cpus: '0.1'
+ memory: 128M
+
+ grafana_local:
+ image: grafana/grafana:11.4.0
+ profiles: ["local"]
+ container_name: oullin_grafana_local
+ restart: unless-stopped
+ ports:
+ - "3000:3000"
+ environment:
+ - GF_SERVER_ROOT_URL=http://localhost:3000
+ - GF_SECURITY_ADMIN_USER=admin
+ - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set in .env file}
+ - GF_USERS_ALLOW_SIGN_UP=false
+ - GF_AUTH_ANONYMOUS_ENABLED=false
+ - GF_INSTALL_PLUGINS=
+ - GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus_local:9090
+ volumes:
+ - grafana_data_local:/var/lib/grafana
+ - ./infra/metrics/grafana/provisioning:/etc/grafana/provisioning:ro
+ - ./infra/metrics/grafana/dashboards:/var/lib/grafana/dashboards:ro
networks:
- caddy_net
+ depends_on:
+ prometheus_local:
+ condition: service_healthy
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 30s
+ deploy:
+ resources:
+ limits:
+ cpus: '0.5'
+ memory: 512M
+ reservations:
+ cpus: '0.1'
+ memory: 128M
# A dedicated service for running one-off Go commands
api-runner:
@@ -86,7 +337,7 @@ services:
- ./.env
build:
context: .
- dockerfile: ./docker/dockerfile-api
+ dockerfile: ./infra/docker/dockerfile-api
target: builder
volumes:
- .:/app
@@ -128,7 +379,7 @@ services:
ENV_HTTP_HOST: 0.0.0.0
build:
context: .
- dockerfile: ./docker/dockerfile-api
+ dockerfile: ./infra/docker/dockerfile-api
args:
- APP_VERSION=0.0.0.1
- APP_HOST_PORT=${ENV_HTTP_PORT}
diff --git a/go.mod b/go.mod
index 642b8e8b..837d0b83 100644
--- a/go.mod
+++ b/go.mod
@@ -13,6 +13,7 @@ require (
github.com/joho/godotenv v1.5.1
github.com/klauspost/compress v1.18.0
github.com/lib/pq v1.10.9
+ github.com/prometheus/client_golang v1.20.5
github.com/rs/cors v1.11.1
github.com/testcontainers/testcontainers-go v0.39.0
github.com/testcontainers/testcontainers-go/modules/postgres v0.39.0
@@ -29,7 +30,9 @@ require (
dario.cat/mergo v1.0.2 // indirect
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
+ github.com/beorn7/perks v1.0.1 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
+ github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/containerd/errdefs v1.0.0 // indirect
github.com/containerd/errdefs/pkg v0.3.0 // indirect
github.com/containerd/log v0.1.0 // indirect
@@ -66,11 +69,15 @@ require (
github.com/moby/sys/userns v0.1.0 // indirect
github.com/moby/term v0.5.2 // indirect
github.com/morikuni/aec v1.0.0 // indirect
+ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
+ github.com/prometheus/client_model v0.6.1 // indirect
+ github.com/prometheus/common v0.55.0 // indirect
+ github.com/prometheus/procfs v0.15.1 // indirect
github.com/shirou/gopsutil/v4 v4.25.9 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/stretchr/testify v1.11.1 // indirect
diff --git a/go.sum b/go.sum
index 81962bc9..f2d5a7aa 100644
--- a/go.sum
+++ b/go.sum
@@ -10,8 +10,12 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chai2010/webp v1.4.0 h1:6DA2pkkRUPnbOHvvsmGI3He1hBKf/bkRlniAiSGuEko=
github.com/chai2010/webp v1.4.0/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU=
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
@@ -92,6 +96,8 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
@@ -120,6 +126,8 @@ github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
@@ -132,6 +140,14 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
+github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
+github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
+github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
+github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
+github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
+github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
+github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
+github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/rs/cors v1.11.1 h1:eU3gRzXLRK57F5rKMGMZURNdIG4EoAmX8k94r9wXWHA=
@@ -185,8 +201,6 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
-golang.org/toolchain v0.0.1-go1.25.3.linux-amd64 h1:OsvRiFtt0A9JsTaoQsnFK4wKOOAY2UtJvkOT+Djl7tQ=
-golang.org/toolchain v0.0.1-go1.25.3.linux-amd64/go.mod h1:c/4eKWFBYMD/i1j7ipNwtrHQP02jj74611NzmDqwkJE=
golang.org/x/image v0.32.0 h1:6lZQWq75h7L5IWNk0r+SCpUJ6tUVd3v4ZHnbRKLkUDQ=
golang.org/x/image v0.32.0/go.mod h1:/R37rrQmKXtO6tYXAjtDLwQgFLHmhW+V6ayXlxzP2Pc=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
diff --git a/handler/metrics.go b/handler/metrics.go
new file mode 100644
index 00000000..9cbcbcdd
--- /dev/null
+++ b/handler/metrics.go
@@ -0,0 +1,23 @@
+package handler
+
+import (
+ "net/http"
+
+ "github.com/oullin/pkg/endpoint"
+ "github.com/prometheus/client_golang/prometheus/promhttp"
+)
+
+type MetricsHandler struct{}
+
+func NewMetricsHandler() MetricsHandler {
+ return MetricsHandler{}
+}
+
+// Handle writes the Prometheus metrics exposition to w via promhttp.Handler and always returns nil.
+// NOTE(review): this method performs no authentication; access is presumably restricted by Docker
+// network isolation (caddy_net / oullin_net, not exposed to host) and the proxy rules - confirm.
+func (h MetricsHandler) Handle(w http.ResponseWriter, r *http.Request) *endpoint.ApiError {
+ // Delegate the whole response (status, headers, body) to the standard promhttp handler
+ promhttp.Handler().ServeHTTP(w, r)
+ return nil
+}
diff --git a/caddy/Caddyfile.local b/infra/caddy/Caddyfile.local
similarity index 74%
rename from caddy/Caddyfile.local
rename to infra/caddy/Caddyfile.local
index d1c84dbc..57e3819c 100644
--- a/caddy/Caddyfile.local
+++ b/infra/caddy/Caddyfile.local
@@ -2,6 +2,15 @@
# This is the most reliable way to ensure Caddy acts as a simple HTTP proxy locally.
{
auto_https off
+
+ # Enable metrics collection for HTTP handlers
+ servers {
+ metrics
+ }
+
+ # Admin API listens only on localhost within container for security
+ # Prometheus accesses /metrics via the dedicated :9180 listener, not the admin API
+ admin 127.0.0.1:2019
}
# It tells Caddy to listen on its internal port 80 for any incoming hostname.
@@ -35,8 +44,27 @@
respond 204
}
+ # Block protected paths
+ @protected path /metrics /generate-signature*
+ handle @protected {
+ respond 403
+ }
+
# Reverse proxy all incoming requests to the 'api' service.
# - The service name 'api' is resolved by Docker's internal DNS to the correct container IP on the 'caddy_net' network.
# - The API container listens on port 8080 (from the ENV_HTTP_PORT).
reverse_proxy api:8080
}
+
+# INTERNAL metrics endpoint for Prometheus scraping
+# This exposes ONLY /metrics, not the full admin API
+# Listens on all interfaces but not published to host (Docker network only)
+:9180 {
+ handle /metrics {
+ reverse_proxy localhost:2019
+ }
+
+ handle {
+ respond 404
+ }
+}
diff --git a/caddy/Caddyfile.prod b/infra/caddy/Caddyfile.prod
similarity index 82%
rename from caddy/Caddyfile.prod
rename to infra/caddy/Caddyfile.prod
index 16f46287..b71aca96 100644
--- a/caddy/Caddyfile.prod
+++ b/infra/caddy/Caddyfile.prod
@@ -1,3 +1,15 @@
+# Global options: Enable the admin API and metrics
+{
+ # Enable metrics collection for HTTP handlers
+ servers {
+ metrics
+ }
+
+ # Admin API listens only on localhost within container for security
+ # Prometheus accesses /metrics via the dedicated :9180 listener, not the admin API
+ admin 127.0.0.1:2019
+}
+
# Caddy will automatically provision a Let's Encrypt certificate.
gocanto.dev, www.gocanto.dev {
log {
@@ -29,8 +41,8 @@ oullin.io {
format json
}
- # --- Public listener: block protected path
- @protected_public path /api/generate-signature*
+ # --- Public listener: block protected paths
+ @protected_public path /api/generate-signature* /api/metrics
handle @protected_public {
respond 403
}
@@ -117,3 +129,16 @@ oullin.io {
respond 403
}
}
+
+# INTERNAL metrics endpoint for Prometheus scraping
+# This exposes ONLY /metrics, not the full admin API
+# Listens on all interfaces but not published to host (Docker network only)
+:9180 {
+ handle /metrics {
+ reverse_proxy localhost:2019
+ }
+
+ handle {
+ respond 404
+ }
+}
diff --git a/caddy/Dockerfile b/infra/caddy/Dockerfile
similarity index 93%
rename from caddy/Dockerfile
rename to infra/caddy/Dockerfile
index 6de850e3..69757ec0 100644
--- a/caddy/Dockerfile
+++ b/infra/caddy/Dockerfile
@@ -1,4 +1,4 @@
-# Filename: caddy/Dockerfile
+# Filename: infra/caddy/Dockerfile
# This Dockerfile builds a Caddy image using a specific, stable version number.
# Define a build argument for the Caddy version with a sensible default.
diff --git a/caddy/mtls/.gitkeep b/infra/caddy/mtls/.gitkeep
similarity index 100%
rename from caddy/mtls/.gitkeep
rename to infra/caddy/mtls/.gitkeep
diff --git a/caddy/readme.md b/infra/caddy/readme.md
similarity index 100%
rename from caddy/readme.md
rename to infra/caddy/readme.md
diff --git a/docker/dockerfile-api b/infra/docker/dockerfile-api
similarity index 100%
rename from docker/dockerfile-api
rename to infra/docker/dockerfile-api
diff --git a/metal/makefile/app.mk b/infra/makefile/app.mk
similarity index 66%
rename from metal/makefile/app.mk
rename to infra/makefile/app.mk
index b29af9bb..ad08b22a 100644
--- a/metal/makefile/app.mk
+++ b/infra/makefile/app.mk
@@ -1,12 +1,44 @@
-.PHONY: fresh destroy audit watch format run-cli test-all run-cli-docker run-metal
+# -------------------------------------------------------------------------------------------------------------------- #
+# Application Management Targets
+# -------------------------------------------------------------------------------------------------------------------- #
-DB_SECRET_USERNAME ?= ./database/infra/secrets/pg_username
-DB_SECRET_PASSWORD ?= ./database/infra/secrets/pg_password
-DB_SECRET_DBNAME ?= ./database/infra/secrets/pg_dbname
+# -------------------------------------------------------------------------------------------------------------------- #
+# Configuration Variables
+# -------------------------------------------------------------------------------------------------------------------- #
+
+ROOT_PATH := $(shell pwd)
+DB_SECRETS_DIR := $(ROOT_PATH)/database/infra/secrets
+
+DB_SECRET_USERNAME ?= $(DB_SECRETS_DIR)/pg_username
+DB_SECRET_PASSWORD ?= $(DB_SECRETS_DIR)/pg_password
+DB_SECRET_DBNAME ?= $(DB_SECRETS_DIR)/pg_dbname
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# PHONY Targets
+# -------------------------------------------------------------------------------------------------------------------- #
+
+.PHONY: fresh destroy audit watch format run-cli test-all run-cli-docker run-metal install-air
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Code Quality Commands
+# -------------------------------------------------------------------------------------------------------------------- #
format:
gofmt -w -s .
+audit:
+ $(call external_deps,'.')
+ $(call external_deps,'./app/...')
+ $(call external_deps,'./database/...')
+ $(call external_deps,'./docs/...')
+
+test-all:
+ go test ./...
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Docker Management Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
fresh:
docker compose down --volumes --rmi all --remove-orphans
docker ps
@@ -22,11 +54,9 @@ destroy:
docker ps -aq | xargs --no-run-if-empty docker rm && \
docker ps
-audit:
- $(call external_deps,'.')
- $(call external_deps,'./app/...')
- $(call external_deps,'./database/...')
- $(call external_deps,'./docs/...')
+# -------------------------------------------------------------------------------------------------------------------- #
+# Development Tools
+# -------------------------------------------------------------------------------------------------------------------- #
watch:
# --- Works with (air).
@@ -39,6 +69,10 @@ install-air:
@echo "Installing air ..."
@go install github.com/air-verse/air@latest
+# -------------------------------------------------------------------------------------------------------------------- #
+# CLI Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
run-cli:
@missing_values=""; \
missing_files=""; \
@@ -115,11 +149,9 @@ run-cli:
printf "\n$(RED)❌ CLI exited with status $$status.$(NC)\n"; \
exit $$status; \
fi
+
run-cli-docker:
make run-cli DB_SECRET_USERNAME=$(DB_SECRET_USERNAME) DB_SECRET_PASSWORD=$(DB_SECRET_PASSWORD) DB_SECRET_DBNAME=$(DB_SECRET_DBNAME)
-test-all:
- go test ./...
-
run-metal:
go run metal/cli/main.go
diff --git a/metal/makefile/build.mk b/infra/makefile/build.mk
similarity index 100%
rename from metal/makefile/build.mk
rename to infra/makefile/build.mk
diff --git a/metal/makefile/caddy.mk b/infra/makefile/caddy.mk
similarity index 91%
rename from metal/makefile/caddy.mk
rename to infra/makefile/caddy.mk
index c2f6e748..8e71e1c0 100644
--- a/metal/makefile/caddy.mk
+++ b/infra/makefile/caddy.mk
@@ -1,8 +1,8 @@
.PHONY: caddy-gen-certs caddy-del-certs caddy-validate caddy-fresh caddy-restart
-CADDY_MTLS_DIR = $(ROOT_PATH)/caddy/mtls
-APP_CADDY_CONFIG_PROD_FILE ?= caddy/Caddyfile.prod
-APP_CADDY_CONFIG_LOCAL_FILE ?= caddy/Caddyfile.local
+CADDY_MTLS_DIR = $(ROOT_PATH)/infra/caddy/mtls
+APP_CADDY_CONFIG_PROD_FILE ?= infra/caddy/Caddyfile.prod
+APP_CADDY_CONFIG_LOCAL_FILE ?= infra/caddy/Caddyfile.local
caddy-restart:
docker compose up -d --force-recreate caddy_prod
@@ -66,6 +66,6 @@ caddy-del-certs:
caddy-validate:
@docker run --rm \
- -v "$(ROOT_PATH)/caddy/Caddyfile.prod:/etc/caddy/Caddyfile:ro" \
- -v "$(ROOT_PATH)/caddy/mtls:/etc/caddy/mtls:ro" \
+ -v "$(ROOT_PATH)/infra/caddy/Caddyfile.prod:/etc/caddy/Caddyfile:ro" \
+ -v "$(ROOT_PATH)/infra/caddy/mtls:/etc/caddy/mtls:ro" \
caddy:2.10.0 caddy validate --config /etc/caddy/Caddyfile
diff --git a/metal/makefile/db.mk b/infra/makefile/db.mk
similarity index 100%
rename from metal/makefile/db.mk
rename to infra/makefile/db.mk
diff --git a/metal/makefile/env.mk b/infra/makefile/env.mk
similarity index 100%
rename from metal/makefile/env.mk
rename to infra/makefile/env.mk
diff --git a/metal/makefile/helpers.mk b/infra/makefile/helpers.mk
similarity index 100%
rename from metal/makefile/helpers.mk
rename to infra/makefile/helpers.mk
diff --git a/metal/makefile/infra.mk b/infra/makefile/infra.mk
similarity index 100%
rename from metal/makefile/infra.mk
rename to infra/makefile/infra.mk
diff --git a/metal/makefile/logs.mk b/infra/makefile/logs.mk
similarity index 100%
rename from metal/makefile/logs.mk
rename to infra/makefile/logs.mk
diff --git a/infra/makefile/monitor.mk b/infra/makefile/monitor.mk
new file mode 100644
index 00000000..2288b911
--- /dev/null
+++ b/infra/makefile/monitor.mk
@@ -0,0 +1,556 @@
+# -------------------------------------------------------------------------------------------------------------------- #
+# Monitoring Stack Targets
+# -------------------------------------------------------------------------------------------------------------------- #
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Configuration Variables
+# -------------------------------------------------------------------------------------------------------------------- #
+
+ROOT_PATH := $(shell pwd)
+MONITORING_DIR := $(ROOT_PATH)/infra/metrics
+BACKUPS_DIR := $(ROOT_PATH)/storage/monitoring/backups
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Volume Names (defined in docker-compose.yml)
+# -------------------------------------------------------------------------------------------------------------------- #
+
+PROMETHEUS_VOLUME_LOCAL := prometheus_data_local
+PROMETHEUS_VOLUME_PROD := prometheus_data_prod
+GRAFANA_VOLUME_LOCAL := grafana_data_local
+GRAFANA_VOLUME_PROD := grafana_data_prod
+
+# Docker service names (defined in docker-compose.yml)
+PROMETHEUS_SERVICE_LOCAL := prometheus_local
+PROMETHEUS_SERVICE_PROD := prometheus
+GRAFANA_SERVICE_LOCAL := grafana_local
+GRAFANA_SERVICE_PROD := grafana
+POSTGRES_EXPORTER_SERVICE_LOCAL := postgres_exporter_local
+POSTGRES_EXPORTER_SERVICE_PROD := postgres_exporter
+
+# Monitoring service URLs and ports
+GRAFANA_HOST := localhost
+GRAFANA_PORT := 3000
+GRAFANA_URL := http://$(GRAFANA_HOST):$(GRAFANA_PORT)
+
+PROMETHEUS_HOST := localhost
+PROMETHEUS_PORT := 9090
+PROMETHEUS_URL := http://$(PROMETHEUS_HOST):$(PROMETHEUS_PORT)
+
+CADDY_ADMIN_HOST := localhost
+CADDY_ADMIN_PORT := 2019
+CADDY_ADMIN_URL := http://$(CADDY_ADMIN_HOST):$(CADDY_ADMIN_PORT)
+
+API_HOST := localhost
+API_PORT := 18080
+API_URL := http://$(API_HOST):$(API_PORT)
+PING_USERNAME ?= $(ENV_PING_USERNAME)
+PING_PASSWORD ?= $(ENV_PING_PASSWORD)
+PING_AUTH_FLAG := $(if $(and $(PING_USERNAME),$(PING_PASSWORD)),-u $(PING_USERNAME):$(PING_PASSWORD),)
+
+# Production API endpoint (behind Caddy)
+API_PROD_HOST := localhost
+API_PROD_URL := http://$(API_PROD_HOST)
+
+# Internal service URLs (Docker network)
+PG_EXPORTER_HOST := postgres_exporter_local
+PG_EXPORTER_PORT := 9187
+PG_EXPORTER_URL := http://$(PG_EXPORTER_HOST):$(PG_EXPORTER_PORT)
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# PHONY Targets
+# -------------------------------------------------------------------------------------------------------------------- #
+
+.PHONY: monitor-up monitor-up-prod monitor-down monitor-down-prod monitor-restart monitor-restart-prod \
+ monitor-up-full monitor-up-full-prod monitor-up-logs monitor-up-logs-prod monitor-down-remove monitor-down-remove-prod \
+ monitor-pull monitor-pull-prod monitor-docker-config monitor-docker-config-prod monitor-docker-exec-prometheus monitor-docker-exec-prometheus-prod \
+ monitor-docker-exec-grafana monitor-docker-exec-grafana-prod monitor-docker-ps monitor-docker-inspect monitor-docker-inspect-prod \
+ monitor-docker-logs-prometheus monitor-docker-logs-prometheus-prod monitor-docker-logs-grafana monitor-docker-logs-grafana-prod monitor-docker-logs-db monitor-docker-logs-db-prod \
+ monitor-status monitor-logs monitor-logs-prod \
+ monitor-test monitor-targets monitor-config monitor-config-prod monitor-grafana monitor-prometheus \
+ monitor-caddy-metrics monitor-api-metrics monitor-db-metrics monitor-db-metrics-prod monitor-metrics \
+ monitor-traffic monitor-traffic-heavy monitor-traffic-prod monitor-traffic-heavy-prod \
+ monitor-clean monitor-clean-prod monitor-stats monitor-stats-prod monitor-backup monitor-backup-prod monitor-export-dashboards monitor-help \
+ monitor-volumes-local-check monitor-volumes-prod-check
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Start/Stop Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
+## Start monitoring stack (local development)
+monitor-up:
+ @printf "$(BOLD)$(CYAN)Starting monitoring stack (local)...$(NC)\n"
+ @docker compose --profile local up -d $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL)
+ @sleep 3
+ @printf "$(BOLD)$(GREEN)✓ Monitoring stack started$(NC)\n"
+ @printf "\n$(BOLD)Access points:$(NC)\n"
+ @printf " $(GREEN)Grafana:$(NC) $(GRAFANA_URL)\n"
+ @printf " $(GREEN)Prometheus:$(NC) $(PROMETHEUS_URL)\n"
+ @printf " $(GREEN)Caddy Admin:$(NC) $(CADDY_ADMIN_URL)\n\n"
+
+## Start monitoring stack (production)
+monitor-up-prod:
+ @printf "$(BOLD)$(CYAN)Starting monitoring stack (production)...$(NC)\n"
+ @docker compose --profile prod up -d $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD)
+ @sleep 3
+ @printf "$(BOLD)$(GREEN)✓ Monitoring stack started$(NC)\n"
+ @printf "\n$(BOLD)Access points (from server):$(NC)\n"
+ @printf " $(GREEN)Grafana:$(NC) $(GRAFANA_URL)\n"
+ @printf " $(GREEN)Prometheus:$(NC) $(PROMETHEUS_URL)\n"
+ @printf " $(GREEN)Caddy Admin:$(NC) $(CADDY_ADMIN_URL)\n\n"
+
+## Stop monitoring stack (local)
+monitor-down:
+ @printf "$(BOLD)$(CYAN)Stopping monitoring stack (local)...$(NC)\n"
+ @docker compose --profile local stop $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL)
+ @printf "$(BOLD)$(GREEN)✓ Monitoring stack stopped$(NC)\n\n"
+
+## Stop monitoring stack (production)
+monitor-down-prod:
+ @printf "$(BOLD)$(CYAN)Stopping monitoring stack (production)...$(NC)\n"
+ @docker compose --profile prod stop $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD)
+ @printf "$(BOLD)$(GREEN)✓ Monitoring stack stopped$(NC)\n\n"
+
+## Restart monitoring stack (local)
+monitor-restart:
+ @printf "$(BOLD)$(CYAN)Restarting monitoring stack (local)...$(NC)\n"
+ @docker compose --profile local restart $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL)
+ @printf "$(BOLD)$(GREEN)✓ Monitoring stack restarted$(NC)\n\n"
+
+## Restart monitoring stack (production)
+monitor-restart-prod:
+ @printf "$(BOLD)$(CYAN)Restarting monitoring stack (production)...$(NC)\n"
+ @docker compose --profile prod restart $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD)
+ @printf "$(BOLD)$(GREEN)✓ Monitoring stack restarted$(NC)\n\n"
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Docker Compose Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
+## Start monitoring with full stack (API + DB + monitoring) - local
+monitor-up-full:
+ @printf "$(BOLD)$(CYAN)Starting full stack with monitoring (local)...$(NC)\n"
+ @docker compose --profile local up -d
+ @sleep 3
+ @printf "$(BOLD)$(GREEN)✓ Full stack started$(NC)\n\n"
+
+## Start monitoring with full stack (API + DB + monitoring) - production
+monitor-up-full-prod:
+ @printf "$(BOLD)$(CYAN)Starting full stack with monitoring (production)...$(NC)\n"
+ @docker compose --profile prod up -d
+ @sleep 3
+ @printf "$(BOLD)$(GREEN)✓ Full stack started$(NC)\n\n"
+
+## Start monitoring stack with logs (foreground) - local
+monitor-up-logs:
+ @printf "$(BOLD)$(CYAN)Starting monitoring stack with logs (local)...$(NC)\n"
+ @docker compose --profile local up $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL)
+
+## Start monitoring stack with logs (foreground) - production
+monitor-up-logs-prod:
+ @printf "$(BOLD)$(CYAN)Starting monitoring stack with logs (production)...$(NC)\n"
+ @docker compose --profile prod up $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD)
+
+## Stop and remove monitoring containers - local
+monitor-down-remove:
+ @printf "$(BOLD)$(CYAN)Stopping and removing monitoring containers (local)...$(NC)\n"
+ @docker compose --profile local down $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL)
+ @printf "$(BOLD)$(GREEN)✓ Containers stopped and removed$(NC)\n\n"
+
+## Stop and remove monitoring containers - production
+monitor-down-remove-prod:
+ @printf "$(BOLD)$(CYAN)Stopping and removing monitoring containers (production)...$(NC)\n"
+ @docker compose --profile prod down $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD)
+ @printf "$(BOLD)$(GREEN)✓ Containers stopped and removed$(NC)\n\n"
+
+## Pull latest monitoring images (local)
+monitor-pull:
+ @printf "$(BOLD)$(CYAN)Pulling latest monitoring images (local)...$(NC)\n"
+ @docker compose pull $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL)
+ @printf "$(BOLD)$(GREEN)✓ Images pulled$(NC)\n\n"
+
+## Pull latest monitoring images (production)
+monitor-pull-prod:
+ @printf "$(BOLD)$(CYAN)Pulling latest monitoring images (production)...$(NC)\n"
+ @docker compose pull $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD)
+ @printf "$(BOLD)$(GREEN)✓ Images pulled$(NC)\n\n"
+
+## Show docker compose config for monitoring services (local); --profile is a top-level compose flag, so it must precede `config`
+monitor-docker-config:
+ @printf "$(BOLD)$(CYAN)Docker Compose Configuration (monitoring - local)$(NC)\n\n"
+ @docker compose --profile local config | grep -A 20 "$(PROMETHEUS_SERVICE_LOCAL)\|$(GRAFANA_SERVICE_LOCAL)\|$(POSTGRES_EXPORTER_SERVICE_LOCAL)" || docker compose --profile local config
+
+## Show docker compose config for monitoring services (production); --profile is a top-level compose flag, so it must precede `config`
+monitor-docker-config-prod:
+ @printf "$(BOLD)$(CYAN)Docker Compose Configuration (monitoring - production)$(NC)\n\n"
+ @docker compose --profile prod config | grep -A 20 "$(PROMETHEUS_SERVICE_PROD)\|$(GRAFANA_SERVICE_PROD)\|$(POSTGRES_EXPORTER_SERVICE_PROD)" || docker compose --profile prod config
+
+## Open an interactive /bin/sh session inside the oullin_prometheus_local container (local); requires a TTY
+monitor-docker-exec-prometheus:
+ @printf "$(BOLD)$(CYAN)Executing shell in Prometheus container (local)...$(NC)\n"
+ @docker exec -it oullin_prometheus_local /bin/sh
+
+## Open an interactive /bin/sh session inside the oullin_prometheus container (production); requires a TTY
+monitor-docker-exec-prometheus-prod:
+ @printf "$(BOLD)$(CYAN)Executing shell in Prometheus container (production)...$(NC)\n"
+ @docker exec -it oullin_prometheus /bin/sh
+
+## Open an interactive /bin/sh session inside the oullin_grafana_local container (local); requires a TTY
+monitor-docker-exec-grafana:
+ @printf "$(BOLD)$(CYAN)Executing shell in Grafana container (local)...$(NC)\n"
+ @docker exec -it oullin_grafana_local /bin/sh
+
+## Open an interactive /bin/sh session inside the oullin_grafana container (production); requires a TTY
+monitor-docker-exec-grafana-prod:
+ @printf "$(BOLD)$(CYAN)Executing shell in Grafana container (production)...$(NC)\n"
+ @docker exec -it oullin_grafana /bin/sh
+
+## Show docker ps for monitoring containers
+monitor-docker-ps:
+ @printf "$(BOLD)$(CYAN)Monitoring Containers$(NC)\n\n"
+ @docker ps --filter "name=prometheus" --filter "name=grafana" --filter "name=exporter" --format "table {{.ID}}\t{{.Names}}\t{{.Status}}\t{{.Ports}}"
+ @printf "\n"
+
+## Show docker inspect for monitoring containers (local)
+monitor-docker-inspect:
+ @printf "$(BOLD)$(CYAN)Inspecting Monitoring Containers (local)$(NC)\n\n"
+ @docker inspect oullin_prometheus_local oullin_grafana_local oullin_postgres_exporter_local 2>/dev/null | jq '.[].Name, .[].State, .[].NetworkSettings.Networks' || echo "$(RED)Containers not running$(NC)"
+
+## Show docker inspect for monitoring containers (production)
+monitor-docker-inspect-prod:
+ @printf "$(BOLD)$(CYAN)Inspecting Monitoring Containers (production)$(NC)\n\n"
+ @docker inspect oullin_prometheus oullin_grafana oullin_postgres_exporter 2>/dev/null | jq '.[].Name, .[].State, .[].NetworkSettings.Networks' || echo "$(RED)Containers not running$(NC)"
+
+## View monitoring container logs (docker logs - local)
+monitor-docker-logs-prometheus:
+ @docker logs -f oullin_prometheus_local
+
+monitor-docker-logs-grafana:
+ @docker logs -f oullin_grafana_local
+
+monitor-docker-logs-db:
+ @docker logs -f oullin_postgres_exporter_local
+
+## View monitoring container logs (docker logs - production)
+monitor-docker-logs-prometheus-prod:
+ @docker logs -f oullin_prometheus
+
+monitor-docker-logs-grafana-prod:
+ @docker logs -f oullin_grafana
+
+monitor-docker-logs-db-prod:
+ @docker logs -f oullin_postgres_exporter
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Status & Information Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
+## Show status of monitoring services
+monitor-status:
+ @printf "$(BOLD)$(CYAN)Monitoring Stack Status$(NC)\n\n"
+ @docker ps --filter "name=prometheus" --filter "name=grafana" --filter "name=exporter" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
+ @printf "\n"
+
+## Show logs from all monitoring services (local)
+monitor-logs:
+ @printf "$(BOLD)$(CYAN)Monitoring Stack Logs (local)$(NC)\n\n"
+ @docker compose logs -f $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL) $(POSTGRES_EXPORTER_SERVICE_LOCAL)
+
+## Show logs from all monitoring services (production)
+monitor-logs-prod:
+ @printf "$(BOLD)$(CYAN)Monitoring Stack Logs (production)$(NC)\n\n"
+ @docker compose logs -f $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD) $(POSTGRES_EXPORTER_SERVICE_PROD)
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Testing & Verification Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
+## Run full monitoring stack test suite (local profile only)
+monitor-test:
+ @printf "$(BOLD)$(CYAN)Running monitoring stack tests (local profile)...$(NC)\n"
+ @printf "$(YELLOW)Note: This target is for local development only.$(NC)\n"
+ @printf "$(YELLOW)For production, verify monitoring from the server directly.$(NC)\n\n"
+ @printf "$(BOLD)1. Checking services are running...$(NC)\n"
+ @docker ps --filter "name=$(PROMETHEUS_SERVICE_LOCAL)" --filter "name=$(GRAFANA_SERVICE_LOCAL)" --filter "name=$(POSTGRES_EXPORTER_SERVICE_LOCAL)" --format " ✓ {{.Names}}: {{.Status}}" || echo " $(RED)✗ Services not running$(NC)"
+ @printf "\n$(BOLD)2. Testing Prometheus targets...$(NC)\n"
+ @curl -s $(PROMETHEUS_URL)/api/v1/targets | grep -q '"health":"up"' && echo " $(GREEN)✓ Prometheus targets are UP$(NC)" || echo " $(RED)✗ Some targets are DOWN$(NC)"
+ @printf "\n$(BOLD)3. Testing Caddy metrics endpoint...$(NC)\n"
+ @curl -s $(CADDY_ADMIN_URL)/metrics | grep -q "caddy_http_requests_total" && echo " $(GREEN)✓ Caddy metrics accessible$(NC)" || echo " $(RED)✗ Caddy metrics unavailable$(NC)"
+ @printf "\n$(BOLD)4. Testing API metrics endpoint...$(NC)\n"
+ @curl -s $(API_URL)/metrics | grep -q "go_goroutines" && echo " $(GREEN)✓ API metrics accessible$(NC)" || echo " $(RED)✗ API metrics unavailable$(NC)"
+ @printf "\n$(BOLD)5. Testing Grafana...$(NC)\n"
+ @curl -s $(GRAFANA_URL)/api/health | grep -q "ok" && echo " $(GREEN)✓ Grafana is healthy$(NC)" || echo " $(RED)✗ Grafana is unhealthy$(NC)"
+ @printf "\n$(BOLD)$(GREEN)Test suite completed!$(NC)\n\n"
+
+## Verify Prometheus targets status
+monitor-targets:
+ @printf "$(BOLD)$(CYAN)Prometheus Targets Status$(NC)\n\n"
+ @curl -s $(PROMETHEUS_URL)/api/v1/targets | jq -r '.data.activeTargets[] | "[\(.health | ascii_upcase)] \(.labels.job) - \(.scrapeUrl)"' || echo "$(RED)Failed to fetch targets. Is Prometheus running?$(NC)"
+ @printf "\n"
+
+## Check Prometheus configuration (local)
+monitor-config:
+ @printf "$(BOLD)$(CYAN)Prometheus Configuration (local)$(NC)\n\n"
+ @docker exec oullin_prometheus_local cat /etc/prometheus/prometheus.yml
+
+## Check Prometheus configuration (production)
+monitor-config-prod:
+ @printf "$(BOLD)$(CYAN)Prometheus Configuration (production)$(NC)\n\n"
+ @docker exec oullin_prometheus cat /etc/prometheus/prometheus.yml
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Metrics Access Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
+## Open Grafana in browser (tries xdg-open, then `open`; prints the URL if neither opener is available or succeeds)
+monitor-grafana:
+ @printf "$(BOLD)$(CYAN)Opening Grafana...$(NC)\n"
+ @printf "URL: $(GREEN)$(GRAFANA_URL)$(NC)\n"
+ @printf "Credentials: admin / (set via GRAFANA_ADMIN_PASSWORD)\n\n"
+ @{ command -v xdg-open > /dev/null 2>&1 && xdg-open $(GRAFANA_URL); } || { command -v open > /dev/null 2>&1 && open $(GRAFANA_URL); } || echo "Please open $(GRAFANA_URL) in your browser"
+
+## Open Prometheus in browser (tries xdg-open, then `open`; prints the URL if neither opener is available or succeeds)
+monitor-prometheus:
+ @printf "$(BOLD)$(CYAN)Opening Prometheus...$(NC)\n"
+ @printf "URL: $(GREEN)$(PROMETHEUS_URL)$(NC)\n\n"
+ @{ command -v xdg-open > /dev/null 2>&1 && xdg-open $(PROMETHEUS_URL); } || { command -v open > /dev/null 2>&1 && open $(PROMETHEUS_URL); } || echo "Please open $(PROMETHEUS_URL) in your browser"
+
+## Show Caddy metrics
+monitor-caddy-metrics:
+ @printf "$(BOLD)$(CYAN)Caddy Metrics$(NC)\n\n"
+ @curl -s $(CADDY_ADMIN_URL)/metrics | grep "^caddy_" | head -20
+ @printf "\n$(YELLOW)... (showing first 20 metrics)$(NC)\n"
+ @printf "Full metrics: $(GREEN)$(CADDY_ADMIN_URL)/metrics$(NC)\n\n"
+
+## Show API metrics
+monitor-api-metrics:
+ @printf "$(BOLD)$(CYAN)API Metrics$(NC)\n\n"
+ @curl -s $(API_URL)/metrics | grep "^go_" | head -20
+ @printf "\n$(YELLOW)... (showing first 20 metrics)$(NC)\n"
+ @printf "Full metrics: $(GREEN)$(API_URL)/metrics$(NC)\n\n"
+
+## Show PostgreSQL metrics (local)
+monitor-db-metrics:
+ @printf "$(BOLD)$(CYAN)PostgreSQL Metrics (local)$(NC)\n\n"
+ @docker exec oullin_prometheus_local curl -s $(PG_EXPORTER_URL)/metrics | grep "^pg_" | head -20
+ @printf "\n$(YELLOW)... (showing first 20 metrics)$(NC)\n\n"
+
+## Show PostgreSQL metrics (production)
+monitor-db-metrics-prod:
+ @printf "$(BOLD)$(CYAN)PostgreSQL Metrics (production)$(NC)\n\n"
+ @docker exec oullin_prometheus curl -s http://postgres_exporter:9187/metrics | grep "^pg_" | head -20
+ @printf "\n$(YELLOW)... (showing first 20 metrics)$(NC)\n\n"
+
+## Show all metrics endpoints
+monitor-metrics:
+ @printf "$(BOLD)$(CYAN)Available Metrics Endpoints$(NC)\n\n"
+ @printf " $(GREEN)Caddy:$(NC) $(CADDY_ADMIN_URL)/metrics\n"
+ @printf " $(GREEN)API:$(NC) $(API_URL)/metrics\n"
+ @printf " $(GREEN)PostgreSQL:$(NC) $(PG_EXPORTER_URL)/metrics (internal)\n"
+ @printf " $(GREEN)Prometheus:$(NC) $(PROMETHEUS_URL)/metrics\n\n"
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Traffic Generation & Testing
+# -------------------------------------------------------------------------------------------------------------------- #
+
+## Generate test traffic to populate metrics (local profile)
+monitor-traffic:
+ @if [ -z "$(PING_USERNAME)" ] || [ -z "$(PING_PASSWORD)" ]; then \
+ printf "$(RED)Missing ping credentials. Export ENV_PING_USERNAME/ENV_PING_PASSWORD or pass PING_USERNAME/PING_PASSWORD to make.$(NC)\n"; \
+ exit 1; \
+ fi
+ @printf "$(BOLD)$(CYAN)Generating test traffic (local)...$(NC)\n"
+ @printf "Making 100 requests to /ping endpoint...\n"
+ @for i in $$(seq 1 100); do \
+ curl -s $(PING_AUTH_FLAG) $(API_URL)/ping > /dev/null && printf "." || printf "$(RED)✗$(NC)"; \
+ sleep 0.1; \
+ done
+ @printf "\n$(BOLD)$(GREEN)✓ Test traffic generated$(NC)\n"
+ @printf "\nCheck dashboards at: $(GREEN)$(GRAFANA_URL)$(NC)\n\n"
+
+## Generate heavy test traffic (local profile); sends the ping basic-auth credentials when PING_USERNAME/PING_PASSWORD are set, like monitor-traffic
+monitor-traffic-heavy:
+ @printf "$(BOLD)$(CYAN)Generating heavy test traffic (local)...$(NC)\n"
+ @printf "Making 500 requests with 5 concurrent connections...\n"
+ @for i in $$(seq 1 100); do \
+ (for j in $$(seq 1 5); do curl -s $(PING_AUTH_FLAG) $(API_URL)/ping > /dev/null & done; wait); \
+ printf "."; \
+ sleep 0.05; \
+ done
+ @printf "\n$(BOLD)$(GREEN)✓ Heavy test traffic generated$(NC)\n\n"
+
+## Generate test traffic to populate metrics (production profile)
+monitor-traffic-prod:
+ @printf "$(BOLD)$(CYAN)Generating test traffic (production)...$(NC)\n"
+ @printf "Making 100 requests to /api/ping endpoint...\n"
+ @for i in $$(seq 1 100); do \
+ curl -s $(API_PROD_URL)/api/ping > /dev/null && printf "." || printf "$(RED)✗$(NC)"; \
+ sleep 0.1; \
+ done
+ @printf "\n$(BOLD)$(GREEN)✓ Test traffic generated$(NC)\n"
+ @printf "\n$(YELLOW)Note: Run this from the production server$(NC)\n"
+ @printf "SSH tunnel for Grafana: $(GREEN)ssh -L 3000:localhost:3000 user@server$(NC)\n\n"
+
+## Generate heavy test traffic (production profile)
+monitor-traffic-heavy-prod:
+ @printf "$(BOLD)$(CYAN)Generating heavy test traffic (production)...$(NC)\n"
+ @printf "Making 500 requests with 5 concurrent connections...\n"
+ @for i in $$(seq 1 100); do \
+ (for j in $$(seq 1 5); do curl -s $(API_PROD_URL)/api/ping > /dev/null & done; wait); \
+ printf "."; \
+ sleep 0.05; \
+ done
+ @printf "\n$(BOLD)$(GREEN)✓ Heavy test traffic generated$(NC)\n"
+ @printf "\n$(YELLOW)Note: Run this from the production server$(NC)\n\n"
+
+# -------------------------------------------------------------------------------------------------------------------- #
+# Utility Commands
+# -------------------------------------------------------------------------------------------------------------------- #
+
+## Clean monitoring data (removes all metrics/dashboard data) - local; blocks on Enter first (read needs a variable name under POSIX sh)
+monitor-clean: monitor-volumes-local-check
+ @printf "$(BOLD)$(RED)WARNING: This will delete all monitoring data (local)!$(NC)\n"
+ @printf "Press Ctrl+C to cancel, or Enter to continue..."
+ @read -r _confirm
+ @printf "$(BOLD)$(CYAN)Stopping monitoring stack...$(NC)\n"
+ @docker compose --profile local down $(PROMETHEUS_SERVICE_LOCAL) $(GRAFANA_SERVICE_LOCAL)
+ @printf "$(BOLD)$(CYAN)Removing volumes...$(NC)\n"
+ @docker volume rm -f $(PROMETHEUS_VOLUME_LOCAL) $(GRAFANA_VOLUME_LOCAL) || true
+ @printf "$(BOLD)$(GREEN)✓ Monitoring data cleaned$(NC)\n\n"
+
+## Clean monitoring data (removes all metrics/dashboard data) - production; blocks on Enter first (read needs a variable name under POSIX sh)
+monitor-clean-prod: monitor-volumes-prod-check
+ @printf "$(BOLD)$(RED)WARNING: This will delete all monitoring data (production)!$(NC)\n"
+ @printf "Press Ctrl+C to cancel, or Enter to continue..."
+ @read -r _confirm
+ @printf "$(BOLD)$(CYAN)Stopping monitoring stack...$(NC)\n"
+ @docker compose --profile prod down $(PROMETHEUS_SERVICE_PROD) $(GRAFANA_SERVICE_PROD)
+ @printf "$(BOLD)$(CYAN)Removing volumes...$(NC)\n"
+ @docker volume rm -f $(PROMETHEUS_VOLUME_PROD) $(GRAFANA_VOLUME_PROD) || true
+ @printf "$(BOLD)$(GREEN)✓ Monitoring data cleaned$(NC)\n\n"
+
+## Show monitoring stack resource usage (local)
+monitor-stats:
+ @printf "$(BOLD)$(CYAN)Monitoring Stack Resource Usage (local)$(NC)\n\n"
+ @docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" \
+ oullin_prometheus_local oullin_grafana_local oullin_postgres_exporter_local 2>/dev/null || \
+ echo "$(RED)No monitoring containers running$(NC)"
+ @printf "\n"
+
+## Show monitoring stack resource usage (production)
+monitor-stats-prod:
+ @printf "$(BOLD)$(CYAN)Monitoring Stack Resource Usage (production)$(NC)\n\n"
+ @docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" \
+ oullin_prometheus oullin_grafana oullin_postgres_exporter 2>/dev/null || \
+ echo "$(RED)No monitoring containers running$(NC)"
+ @printf "\n"
+
+## Backup Prometheus data (with automatic rotation) - local
+monitor-backup: monitor-volumes-local-check
+ @printf "$(BOLD)$(CYAN)Backing up Prometheus data (local)...$(NC)\n"
+ @mkdir -p $(BACKUPS_DIR)
+ @docker run --rm -v $(PROMETHEUS_VOLUME_LOCAL):/data -v $(BACKUPS_DIR):/backup alpine \
+ tar czf /backup/prometheus-backup-$$(date +%Y%m%d-%H%M%S).tar.gz /data
+ @printf "$(BOLD)$(GREEN)✓ Backup created in $(BACKUPS_DIR)/$(NC)\n"
+ @printf "$(YELLOW)Rotating backups (keeping last 5)...$(NC)\n"
+ @for f in $$(ls -t $(BACKUPS_DIR)/prometheus-backup-*.tar.gz 2>/dev/null | tail -n +6); do rm -f "$$f"; done || true
+ @BACKUP_COUNT=$$(ls -1 $(BACKUPS_DIR)/prometheus-backup-*.tar.gz 2>/dev/null | wc -l); \
+ printf "$(BOLD)$(GREEN)✓ Backup rotation complete ($${BACKUP_COUNT} backups kept)$(NC)\n\n"
+
+monitor-volumes-local-check:
+ @[ -n "$(PROMETHEUS_VOLUME_LOCAL)" ] && [ -n "$(GRAFANA_VOLUME_LOCAL)" ] || \
+ { printf "$(RED)Unable to resolve monitoring volumes from docker compose config (local profile).$(NC)\n"; exit 1; }
+
+## Backup Prometheus data (with automatic rotation) - production
+monitor-backup-prod: monitor-volumes-prod-check
+ @printf "$(BOLD)$(CYAN)Backing up Prometheus data (production)...$(NC)\n"
+ @mkdir -p $(BACKUPS_DIR)
+ @docker run --rm -v $(PROMETHEUS_VOLUME_PROD):/data -v $(BACKUPS_DIR):/backup alpine \
+ tar czf /backup/prometheus-prod-backup-$$(date +%Y%m%d-%H%M%S).tar.gz /data
+ @printf "$(BOLD)$(GREEN)✓ Backup created in $(BACKUPS_DIR)/$(NC)\n"
+ @printf "$(YELLOW)Rotating backups (keeping last 5)...$(NC)\n"
+ @for f in $$(ls -t $(BACKUPS_DIR)/prometheus-prod-backup-*.tar.gz 2>/dev/null | tail -n +6); do rm -f "$$f"; done || true
+ @BACKUP_COUNT=$$(ls -1 $(BACKUPS_DIR)/prometheus-prod-backup-*.tar.gz 2>/dev/null | wc -l); \
+ printf "$(BOLD)$(GREEN)✓ Backup rotation complete ($${BACKUP_COUNT} backups kept)$(NC)\n\n"
+
+monitor-volumes-prod-check:
+ @[ -n "$(PROMETHEUS_VOLUME_PROD)" ] && [ -n "$(GRAFANA_VOLUME_PROD)" ] || \
+ { printf "$(RED)Unable to resolve monitoring volumes from docker compose config (production profile).$(NC)\n"; exit 1; }
+
+## Export Grafana dashboards to JSON files
+monitor-export-dashboards:
+ @printf "$(BOLD)$(CYAN)Exporting Grafana dashboards...$(NC)\n"
+ @$(MONITORING_DIR)/grafana/scripts/export-dashboards.sh
+
+## Show monitoring help
+monitor-help:
+ @printf "\n$(BOLD)$(CYAN)Monitoring Stack Commands$(NC)\n\n"
+ @printf "$(BOLD)$(BLUE)Start/Stop:$(NC)\n"
+ @printf " $(GREEN)monitor-up$(NC) - Start monitoring stack (local)\n"
+ @printf " $(GREEN)monitor-up-prod$(NC) - Start monitoring stack (production)\n"
+ @printf " $(GREEN)monitor-up-full$(NC) - Start full stack with monitoring (local)\n"
+ @printf " $(GREEN)monitor-up-full-prod$(NC) - Start full stack with monitoring (prod)\n"
+ @printf " $(GREEN)monitor-up-logs$(NC) - Start with logs in foreground (local)\n"
+ @printf " $(GREEN)monitor-up-logs-prod$(NC) - Start with logs in foreground (prod)\n"
+ @printf " $(GREEN)monitor-down$(NC) - Stop monitoring stack (local)\n"
+ @printf " $(GREEN)monitor-down-prod$(NC) - Stop monitoring stack (production)\n"
+ @printf " $(GREEN)monitor-down-remove$(NC) - Stop and remove containers (local)\n"
+ @printf " $(GREEN)monitor-down-remove-prod$(NC) - Stop and remove containers (prod)\n"
+ @printf " $(GREEN)monitor-restart$(NC) - Restart monitoring stack (local)\n"
+ @printf " $(GREEN)monitor-restart-prod$(NC) - Restart monitoring stack (prod)\n\n"
+ @printf "$(BOLD)$(BLUE)Docker Commands:$(NC)\n"
+ @printf " $(GREEN)monitor-docker-ps$(NC) - Show running monitoring containers\n"
+ @printf " $(GREEN)monitor-docker-config$(NC) - Show docker compose config (local)\n"
+ @printf " $(GREEN)monitor-docker-config-prod$(NC) - Show docker compose config (prod)\n"
+ @printf " $(GREEN)monitor-docker-inspect$(NC) - Inspect monitoring containers (local)\n"
+ @printf " $(GREEN)monitor-docker-inspect-prod$(NC) - Inspect monitoring containers (prod)\n"
+ @printf " $(GREEN)monitor-docker-exec-prometheus$(NC) - Shell into Prometheus container (local)\n"
+ @printf " $(GREEN)monitor-docker-exec-prometheus-prod$(NC)- Shell into Prometheus container (prod)\n"
+ @printf " $(GREEN)monitor-docker-exec-grafana$(NC) - Shell into Grafana container (local)\n"
+ @printf " $(GREEN)monitor-docker-exec-grafana-prod$(NC) - Shell into Grafana container (prod)\n"
+ @printf " $(GREEN)monitor-docker-logs-prometheus$(NC) - Docker logs for Prometheus (local)\n"
+ @printf " $(GREEN)monitor-docker-logs-prometheus-prod$(NC)- Docker logs for Prometheus (prod)\n"
+ @printf " $(GREEN)monitor-docker-logs-grafana$(NC) - Docker logs for Grafana (local)\n"
+ @printf " $(GREEN)monitor-docker-logs-grafana-prod$(NC) - Docker logs for Grafana (prod)\n"
+ @printf " $(GREEN)monitor-docker-logs-db$(NC) - Docker logs for DB exporter (local)\n"
+ @printf " $(GREEN)monitor-docker-logs-db-prod$(NC) - Docker logs for DB exporter (prod)\n"
+ @printf " $(GREEN)monitor-pull$(NC) - Pull latest monitoring images (local)\n"
+ @printf " $(GREEN)monitor-pull-prod$(NC) - Pull latest monitoring images (prod)\n\n"
+ @printf "$(BOLD)$(BLUE)Status & Logs:$(NC)\n"
+ @printf " $(GREEN)monitor-status$(NC) - Show status of monitoring services\n"
+ @printf " $(GREEN)monitor-logs$(NC) - Show logs from all services (local)\n"
+ @printf " $(GREEN)monitor-logs-prod$(NC) - Show logs from all services (prod)\n\n"
+ @printf "$(BOLD)$(BLUE)Testing:$(NC)\n"
+ @printf " $(GREEN)monitor-test$(NC) - Run full test suite (local only)\n"
+ @printf " $(GREEN)monitor-targets$(NC) - Show Prometheus targets status\n"
+ @printf " $(GREEN)monitor-traffic$(NC) - Generate test traffic (local)\n"
+ @printf " $(GREEN)monitor-traffic-heavy$(NC) - Generate heavy test traffic (local)\n"
+ @printf " $(GREEN)monitor-traffic-prod$(NC) - Generate test traffic (production)\n"
+ @printf " $(GREEN)monitor-traffic-heavy-prod$(NC) - Generate heavy test traffic (prod)\n\n"
+ @printf "$(BOLD)$(BLUE)Access:$(NC)\n"
+ @printf " $(GREEN)monitor-grafana$(NC) - Open Grafana in browser\n"
+ @printf " $(GREEN)monitor-prometheus$(NC) - Open Prometheus in browser\n"
+ @printf " $(GREEN)monitor-metrics$(NC) - Show all metrics endpoints\n"
+ @printf " $(GREEN)monitor-caddy-metrics$(NC) - Show Caddy metrics\n"
+ @printf " $(GREEN)monitor-api-metrics$(NC) - Show API metrics\n"
+ @printf " $(GREEN)monitor-db-metrics$(NC) - Show PostgreSQL metrics (local)\n"
+ @printf " $(GREEN)monitor-db-metrics-prod$(NC) - Show PostgreSQL metrics (prod)\n\n"
+ @printf "$(BOLD)$(BLUE)Utilities:$(NC)\n"
+ @printf " $(GREEN)monitor-stats$(NC) - Show resource usage (local)\n"
+ @printf " $(GREEN)monitor-stats-prod$(NC) - Show resource usage (prod)\n"
+ @printf " $(GREEN)monitor-config$(NC) - Show Prometheus config (local)\n"
+ @printf " $(GREEN)monitor-config-prod$(NC) - Show Prometheus config (prod)\n"
+ @printf " $(GREEN)monitor-backup$(NC) - Backup Prometheus data (local)\n"
+ @printf " $(GREEN)monitor-backup-prod$(NC) - Backup Prometheus data (prod)\n"
+ @printf " $(GREEN)monitor-export-dashboards$(NC) - Export Grafana dashboards to JSON\n"
+ @printf " $(GREEN)monitor-clean$(NC) - Clean all monitoring data (local)\n"
+ @printf " $(GREEN)monitor-clean-prod$(NC) - Clean all monitoring data (prod)\n\n"
+ @printf "$(BOLD)Quick Start:$(NC)\n"
+ @printf " 1. $(YELLOW)make monitor-up$(NC) - Start the stack\n"
+ @printf " 2. $(YELLOW)make monitor-test$(NC) - Verify everything works\n"
+ @printf " 3. $(YELLOW)make monitor-traffic$(NC) - Generate some traffic\n"
+ @printf " 4. $(YELLOW)make monitor-grafana$(NC) - Open dashboards\n\n"
+ @printf "$(BOLD)Docker Compose Examples:$(NC)\n"
+ @printf " $(YELLOW)docker compose --profile local up -d$(NC) - Start local stack\n"
+ @printf " $(YELLOW)docker compose --profile prod up -d$(NC) - Start prod stack\n"
+ @printf " $(YELLOW)docker ps --filter name=prometheus$(NC) - List containers\n"
+ @printf " $(YELLOW)docker exec -it oullin_prometheus_local /bin/sh$(NC) - Shell access\n\n"
diff --git a/infra/metrics/README.md b/infra/metrics/README.md
new file mode 100644
index 00000000..81112578
--- /dev/null
+++ b/infra/metrics/README.md
@@ -0,0 +1,712 @@
+# Monitoring Stack Documentation
+
+Complete guide for managing and monitoring the Oullin application stack with Prometheus, Grafana, and related tools.
+
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Quick Start](#quick-start)
+3. [Security Model](#security-model)
+4. [Grafana Dashboards](#grafana-dashboards)
+5. [Creating Custom Dashboards](#creating-custom-dashboards)
+6. [Prometheus Queries](#prometheus-queries)
+7. [Troubleshooting](#troubleshooting)
+8. [Maintenance & Backup](#maintenance--backup)
+9. [Resources](#resources)
+
+**For VPS deployment instructions, see [VPS_DEPLOYMENT.md](./VPS_DEPLOYMENT.md)**
+
+---
+
+## Overview
+
+### Stack Components
+
+- **Prometheus**: Metrics collection and time-series storage
+- **Grafana**: Visualization dashboards and alerting
+- **postgres_exporter**: PostgreSQL database metrics
+- **Caddy Admin API**: Reverse proxy metrics
+
+### Pre-configured Dashboards
+
+Three dashboards are automatically provisioned:
+
+1. **Oullin - Overview** (`grafana/dashboards/oullin-overview-oullin-overview.json`)
+ - Caddy request rate
+ - PostgreSQL active connections
+ - HTTP requests by status code
+ - API memory usage and goroutines
+
+2. **PostgreSQL - Database Metrics** (`grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json`)
+ - Active connections
+ - Database size
+ - Transaction rates
+ - Cache hit ratio
+ - Lock statistics
+
+3. **Caddy - Proxy Metrics** (`grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json`)
+ - Total request rate
+ - Response time percentiles
+ - Requests by status code
+ - Traffic rate
+ - Request errors
+
+### Directory Structure
+
+```text
+infra/metrics/
+├── README.md # This file
+├── grafana/
+│ ├── dashboards/ # Dashboard JSON files
+│ ├── provisioning/
+│ │ ├── dashboards/ # Dashboard provisioning config
+│ │ └── datasources/ # Data source configuration
+│ └── scripts/
+│ └── export-dashboards.sh
+└── prometheus/
+ ├── provisioning/
+ │ ├── prometheus.yml # Production Prometheus config
+ │ └── prometheus.local.yml # Local Prometheus config
+ └── scripts/
+ └── postgres-exporter-entrypoint.sh
+```
+
+### Configuration Consistency
+
+The monitoring stack is designed to maintain configuration consistency across local and production environments while respecting environment-specific differences.
+
+#### Shared Configuration Elements
+
+The following configurations are **identical** across both environments:
+
+1. **Grafana Settings:**
+ - Same Grafana version (`grafana/grafana:11.4.0`)
+ - Identical security settings (admin user, sign-up disabled, anonymous disabled)
+ - Same dashboard and datasource provisioning structure
+ - Same volume mount paths
+
+2. **Prometheus Core Settings:**
+ - Same Prometheus version (`prom/prometheus:v3.0.1`)
+ - Identical scrape interval (15s) and evaluation interval (15s)
+ - Same job structure (caddy, postgresql, api, prometheus) with per-environment targets
+ - Same metrics endpoints and paths
+
+3. **Postgres Exporter:**
+ - Same exporter version (`prometheuscommunity/postgres-exporter:v0.15.0`)
+ - Identical port exposure (9187)
+ - Same entrypoint script and secrets handling
+
+#### Environment-Specific Variables
+
+These settings **differ intentionally** based on environment:
+
+| Configuration | Local | Production | Reason |
+|--------------|-------|------------|--------|
+| **Container Names** | `oullin_*_local` | `oullin_*` | Distinguish environments |
+| **Prometheus URL** | `oullin_prometheus_local:9090` | `oullin_prometheus:9090` | Network addressing |
+| **Grafana Port** | `3000:3000` | `127.0.0.1:3000:3000` | Security (prod localhost-only) |
+| **Prometheus Port** | `9090:9090` | `127.0.0.1:9090:9090` | Security (prod localhost-only) |
+| **Data Retention** | 7 days | 30 days | Storage/cost optimization |
+| **Caddy Target** | `caddy_local:9180` | `caddy_prod:9180` | Service dependencies |
+| **PostgreSQL Exporter Target** | `oullin_postgres_exporter_local:9187` | `oullin_postgres_exporter:9187` | Service dependencies |
+| **External Labels** | `monitor: 'oullin-local'`<br>`environment: 'local'` | `monitor: 'oullin-prod'`<br>`environment: 'production'` | Metric identification |
+| **Admin API** | `127.0.0.1:2019:2019` | Not exposed | Debugging access |
+
+#### Environment Variable Usage
+
+The configuration uses environment variables to maintain consistency while adapting to each environment:
+
+**Grafana Datasource** (`grafana/provisioning/datasources/prometheus.yml`):
+```yaml
+url: ${GF_DATASOURCE_PROMETHEUS_URL}
+```
+
+Set via Docker Compose:
+- **Local:** `GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus_local:9090`
+- **Production:** `GF_DATASOURCE_PROMETHEUS_URL=http://oullin_prometheus:9090`
+
+**Required Environment Variables:**
+- `GRAFANA_ADMIN_PASSWORD` - **Required**, no default (set in `.env`)
+- `GF_DATASOURCE_PROMETHEUS_URL` - Set automatically by Docker Compose profile
+
+#### Configuration Files by Environment
+
+**Local Environment:**
+- Prometheus: `prometheus/provisioning/prometheus.local.yml`
+- Profile: `--profile local`
+- Services: `prometheus_local`, `grafana_local`, `caddy_local`, `postgres_exporter_local`
+
+**Production Environment:**
+- Prometheus: `prometheus/provisioning/prometheus.yml`
+- Profile: `--profile prod`
+- Services: `prometheus`, `grafana`, `caddy_prod`, `postgres_exporter`
+
+**Shared Across All Environments:**
+- Grafana datasources: `grafana/provisioning/datasources/prometheus.yml`
+- Grafana dashboards: `grafana/provisioning/dashboards/default.yml`
+- Dashboard JSONs: `grafana/dashboards/*.json`
+- Postgres exporter script: `prometheus/scripts/postgres-exporter-entrypoint.sh`
+
+---
+
+## Quick Start
+
+### Local Development
+
+**Prerequisites:**
+- Docker and Docker Compose installed
+- `.env` file in the repository root with `GRAFANA_ADMIN_PASSWORD` set (required - no default)
+ - Use `make env:init` to copy `.env.example` if you need a starting point
+ - If `.env` already exists, edit it in place instead of appending duplicates
+- Database secrets in `database/infra/secrets/`
+
+**Setup:**
+
+```bash
+# 1. Set Grafana admin password in .env file
+echo "GRAFANA_ADMIN_PASSWORD=$(openssl rand -base64 32)" >> .env
+# (Add or update the key manually if the file already defines it.)
+
+# 2. Start the local monitoring stack
+make monitor-up
+# Or: docker compose --profile local up -d
+
+# 3. Access services
+# Grafana: http://localhost:3000 (admin / your-password)
+# Prometheus: http://localhost:9090
+# Caddy Admin: http://localhost:2019
+```
+
+**Verification:**
+
+```bash
+# Check all services are running
+docker ps
+
+# Verify Prometheus targets are UP
+make monitor-targets
+# Or: curl http://localhost:9090/api/v1/targets
+
+# Generate test traffic
+make monitor-traffic
+
+# View dashboards
+make monitor-grafana
+```
+
+---
+
+## Security Model
+
+### Critical Security Requirements
+
+⚠️ **IMPORTANT**: The monitoring stack includes several security considerations:
+
+1. **Grafana Admin Password**
+ - No default password allowed
+ - Must set `GRAFANA_ADMIN_PASSWORD` in `.env`
+ - Docker Compose will fail if not set
+ - Generate strong password: `openssl rand -base64 32`
+
+2. **Caddy Admin API**
+ - Exposes powerful administrative endpoints (`/load`, `/config`, `/stop`)
+ - **NO authentication** by default
+ - Production: Only accessible within Docker network; restrict further via firewalls/security groups when possible
+ - If you must expose it, configure Caddy's admin access controls (`admin.identity`, `admin.authorize`, or reverse-proxy ACLs) to require authentication
+ - Never expose to public internet
+
+3. **Service Exposure**
+ - Production: Services bound to `127.0.0.1` only
+ - Access via SSH tunneling from remote
+ - No direct internet exposure
+
+### Production Security Configuration
+
+**Docker Compose Production Services:**
+
+```yaml
+grafana:
+ ports:
+ - "127.0.0.1:3000:3000" # Localhost only
+
+prometheus:
+ ports:
+ - "127.0.0.1:9090:9090" # Localhost only
+
+caddy_prod:
+ expose:
+ - "2019" # Internal network only - NOT exposed to host
+```
+
+**Remote Access:**
+
+```bash
+# SSH tunnel for Grafana and Prometheus
+ssh -L 3000:localhost:3000 -L 9090:localhost:9090 user@your-server
+
+# Access Caddy admin API (debugging only)
+docker exec -it oullin_proxy_prod curl http://localhost:2019/metrics
+```
+
+### Security Checklist
+
+- ✅ `GRAFANA_ADMIN_PASSWORD` set with strong password
+- ✅ Firewall configured (UFW)
+- ✅ Only necessary ports exposed (22, 80, 443)
+- ✅ Monitoring services NOT exposed to internet
+- ✅ Docker secrets for sensitive data
+- ✅ Regular backups scheduled
+- ✅ Log rotation configured
+- ✅ SSH key-based authentication
+
+---
+
+## Grafana Dashboards
+
+### Accessing Dashboards
+
+**Local:** <http://localhost:3000>
+**Production:** SSH tunnel then <http://localhost:3000>
+
+### Dashboard Files
+
+All dashboards are in `infra/metrics/grafana/dashboards/`:
+- `oullin-overview-oullin-overview.json`
+- `oullin-postgresql-postgresql-database-metrics.json`
+- `oullin-caddy-caddy-proxy-metrics.json`
+
+### Exporting Dashboards
+
+Use the built-in export script:
+
+```bash
+make monitor-export-dashboards
+```
+
+This will:
+1. List all dashboards in Grafana
+2. Let you select which to export
+3. Save to `infra/metrics/grafana/dashboards/`
+4. Format properly for provisioning
+
+### Manual Export
+
+1. Open your dashboard in Grafana
+2. Click **"Share"** → **"Export"** tab
+3. Click **"Save to file"** or **"View JSON"**
+4. Save to `infra/metrics/grafana/dashboards/`
+5. Restart Grafana: `make monitor-restart`
+
+### Updating Dashboards Safely
+
+To keep dashboard changes reproducible and under version control:
+
+1. **Start monitoring stack**: `make monitor-up`
+2. **Make changes in Grafana UI**: Navigate to <http://localhost:3000> and edit dashboards
+3. **Export your changes**: Run `./infra/metrics/grafana/scripts/export-dashboards.sh`
+ - Select specific dashboard or `all` to export all dashboards
+ - Exports are saved to `infra/metrics/grafana/dashboards/`
+4. **Review the diff**: `git diff infra/metrics/grafana/dashboards/`
+5. **Commit changes**: Add and commit the exported JSON files
+6. **Verify**: `make monitor-restart` to ensure dashboards reload correctly
+
+**Warning:** Always export after making UI changes—manual edits to JSON files can work but are error-prone.
+
+---
+
+## Creating Custom Dashboards
+
+### Method 1: Create in UI (Recommended)
+
+**Step 1:** Start Grafana
+
+```bash
+make monitor-up
+make monitor-grafana # Opens http://localhost:3000
+```
+
+**Step 2:** Create dashboard
+
+1. Click **"+"** → **"Dashboard"** → **"Add visualization"**
+2. Select **"Prometheus"** as data source
+3. Write PromQL query
+4. Choose visualization type (Time series, Stat, Gauge, Table)
+5. Configure panel (title, description, units, thresholds)
+6. Add more panels as needed
+7. Save dashboard
+
+**Step 3:** Export
+
+```bash
+make monitor-export-dashboards
+```
+
+### Method 2: Use Community Dashboards
+
+Grafana has thousands of pre-built dashboards at <https://grafana.com/grafana/dashboards/>.
+
+**Popular for our stack:**
+- [9628](https://grafana.com/grafana/dashboards/9628) - PostgreSQL Database
+- [455](https://grafana.com/grafana/dashboards/455) - PostgreSQL Stats
+- [10826](https://grafana.com/grafana/dashboards/10826) - Go Metrics
+- [6671](https://grafana.com/grafana/dashboards/6671) - Go Processes
+
+**Import via UI:**
+1. Click **"+"** → **"Import"**
+2. Enter dashboard ID
+3. Select **"Prometheus"** as data source
+4. Click **"Import"**
+
+### Dashboard Best Practices
+
+**Organization:**
+- One dashboard per service
+- Overview dashboard for high-level metrics
+- Detail dashboards for deep dives
+- Use tags for categorization
+
+**Panel Design:**
+- Clear titles
+- Descriptions for complex metrics
+- Consistent colors
+- Appropriate units (bytes, %, req/s)
+- Thresholds for warnings/errors
+
+**Query Performance:**
+- Avoid high-cardinality labels
+- Use recording rules for expensive queries
+- Limit time range
+- Use `rate()` instead of raw counters
+
+---
+
+## Prometheus Queries
+
+### API Metrics
+
+```promql
+# Metrics-endpoint scrape rate (counts requests to /metrics itself, not application traffic)
+rate(promhttp_metric_handler_requests_total[5m])
+
+# Memory usage
+go_memstats_alloc_bytes{job="api"}
+
+# Goroutines (check for leaks)
+go_goroutines{job="api"}
+
+# GC duration
+rate(go_gc_duration_seconds_sum[5m])
+
+# Heap allocations
+rate(go_memstats_alloc_bytes_total[5m])
+```
+
+### PostgreSQL Metrics
+
+```promql
+# Active connections
+pg_stat_database_numbackends
+
+# Database size
+pg_database_size_bytes
+
+# Transaction rate
+rate(pg_stat_database_xact_commit[5m])
+
+# Cache hit ratio (should be >90%)
+rate(pg_stat_database_blks_hit[5m]) /
+(rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m]))
+
+# Rows inserted/updated/deleted
+rate(pg_stat_database_tup_inserted[5m])
+rate(pg_stat_database_tup_updated[5m])
+rate(pg_stat_database_tup_deleted[5m])
+```
+
+### Caddy Metrics
+
+```promql
+# Request rate by status (the `code` label is on the duration histogram, not on caddy_http_requests_total)
+sum by(code) (rate(caddy_http_request_duration_seconds_count[5m]))
+
+# Response time percentiles
+histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m]))
+histogram_quantile(0.99, rate(caddy_http_request_duration_seconds_bucket[5m]))
+
+# Error rate
+sum(rate(caddy_http_request_errors_total[5m]))
+
+# Response traffic rate
+rate(caddy_http_response_size_bytes_sum[5m])
+```
+
+---
+
+## Troubleshooting
+
+### Dashboards Don't Load
+
+```bash
+# Check JSON syntax
+jq . < infra/metrics/grafana/dashboards/my-dashboard.json
+
+# Check Grafana logs
+docker logs oullin_grafana_local # Local
+docker logs oullin_grafana # Production
+
+# Or view all monitoring logs
+make monitor-logs # Local
+make monitor-logs-prod # Production
+
+# Verify Prometheus connection
+# Grafana UI → Settings → Data Sources → Prometheus → "Save & Test"
+
+# Ensure Prometheus is running
+docker ps | grep prometheus
+```
+
+### No Data in Panels
+
+```bash
+# Verify Prometheus is scraping targets
+make monitor-targets
+# Or: curl http://localhost:9090/api/v1/targets
+
+# Test query in Prometheus
+# Open http://localhost:9090
+
+# Wait a few minutes for initial data collection
+```
+
+### Prometheus Not Scraping
+
+```bash
+# Check network connectivity
+docker exec -it oullin_prometheus_local ping caddy_local
+
+# Verify service exposes metrics (the Prometheus image ships BusyBox wget, not curl)
+docker exec -it oullin_prometheus_local wget -qO- http://caddy_local:9180/metrics
+
+# Check Prometheus config
+docker exec -it oullin_prometheus_local cat /etc/prometheus/prometheus.yml
+```
+
+### Targets Show as DOWN
+
+```bash
+# Check container networking
+docker network ls
+docker network inspect caddy_net
+
+# Check container names match Prometheus config
+docker ps
+
+# Restart services
+make monitor-restart
+# Or: docker compose --profile local restart
+```
+
+### High Memory Usage
+
+```bash
+# Monitor memory
+docker stats
+
+# If Prometheus using too much memory:
+# - Reduce retention time
+# - Decrease scrape frequency
+# - Add metric filters
+```
+
+### Data Not Persisting
+
+```bash
+# Ensure volumes are configured
+docker volume ls
+docker volume inspect prometheus_data_local # Local
+docker volume inspect prometheus_data_prod # Production
+docker volume inspect grafana_data_local # Local
+docker volume inspect grafana_data_prod # Production
+```
+
+---
+
+## Maintenance & Backup
+
+### Backing Up Data
+
+**Automated backup** (recommended):
+
+```bash
+# Keeps the last 5 backups; schedule via cron for daily runs (see VPS_DEPLOYMENT.md)
+make monitor-backup # Local environment
+make monitor-backup-prod # Production environment
+```
+
+Backups saved to:
+- **Local**: `storage/monitoring/backups/prometheus-backup-YYYYMMDD-HHMMSS.tar.gz`
+- **Production**: `storage/monitoring/backups/prometheus-prod-backup-YYYYMMDD-HHMMSS.tar.gz`
+
+**Manual backup:**
+
+```bash
+# Backup Prometheus data
+docker run --rm -v prometheus_data_local:/data -v $(pwd)/backups:/backup alpine \
+ tar czf /backup/prometheus-backup-$(date +%Y%m%d-%H%M%S).tar.gz /data
+# (Use prometheus_data_prod on production hosts)
+
+# Backup Grafana data
+docker run --rm -v grafana_data_local:/data -v $(pwd)/backups:/backup alpine \
+ tar czf /backup/grafana-backup-$(date +%Y%m%d-%H%M%S).tar.gz /data
+# (Use grafana_data_prod on production hosts)
+```
+
+### Restoring from Backup
+
+```bash
+# Stop services
+make monitor-down
+
+# Restore Prometheus data
+# WARNING: This will DELETE all existing Prometheus data. Validate backups and consider restoring in a test environment first.
+docker run --rm -v prometheus_data_local:/data -v $(pwd)/backups:/backup alpine \
+ sh -c "rm -rf /data/* && tar xzf /backup/prometheus-backup-YYYYMMDD-HHMMSS.tar.gz -C /"
+# (Use prometheus_data_prod on production hosts)
+
+# Restore Grafana data
+# WARNING: This will DELETE all existing Grafana data. Keep a secondary backup if unsure.
+docker run --rm -v grafana_data_local:/data -v $(pwd)/backups:/backup alpine \
+ sh -c "rm -rf /data/* && tar xzf /backup/grafana-backup-YYYYMMDD-HHMMSS.tar.gz -C /"
+# (Use grafana_data_prod on production hosts)
+
+# Restart services
+make monitor-up
+```
+
+### Updating the Stack
+
+**Local environment:**
+```bash
+# Pull latest images
+docker compose --profile local pull
+
+# Restart with new images
+make monitor-restart
+# Or: docker compose --profile local up -d
+```
+
+**Production environment:**
+```bash
+# Pull latest images
+docker compose --profile prod pull
+
+# Restart with new images
+make monitor-restart-prod
+# Or: docker compose --profile prod up -d
+```
+
+### Monitoring Resource Usage
+
+```bash
+# CPU and Memory usage
+docker stats
+
+# Disk usage by container
+docker system df -v
+
+# Container logs size
+sudo du -sh /var/lib/docker/containers/*/*-json.log
+```
+
+### Cleaning Up Old Data
+
+Prometheus automatically handles retention based on `--storage.tsdb.retention.time` (30d prod, 7d local).
+
+Manual cleanup:
+
+```bash
+# Stop Prometheus
+docker compose stop prometheus_local
+
+# Clean data
+docker run --rm -v prometheus_data_local:/data alpine rm -rf /data/*
+# (Use prometheus_data_prod on production hosts)
+
+# Restart
+docker compose --profile local up -d prometheus_local
+```
+
+---
+
+## Resources
+
+### Official Documentation
+
+- [Prometheus Documentation](https://prometheus.io/docs/)
+- [Grafana Documentation](https://grafana.com/docs/)
+- [Grafana Dashboards](https://grafana.com/grafana/dashboards/)
+- [Caddy Metrics](https://caddyserver.com/docs/metrics)
+- [PostgreSQL Exporter](https://github.com/prometheus-community/postgres_exporter)
+- [PromQL Basics](https://prometheus.io/docs/prometheus/latest/querying/basics/)
+- [Grafonnet Library](https://github.com/grafana/grafonnet-lib)
+
+### Quick Reference Commands
+
+```bash
+# Start monitoring stack
+make monitor-up # Local
+make monitor-up-prod # Production
+
+# Access services
+make monitor-grafana # Open Grafana
+make monitor-prometheus # Open Prometheus
+
+# Check status
+make monitor-status # Service health
+make monitor-targets # Prometheus targets
+
+# Generate traffic
+make monitor-traffic # Local
+make monitor-traffic-prod # Production
+
+# View logs
+make monitor-logs # All services (local)
+make monitor-logs-prod # All services (production)
+
+# Individual container logs
+docker logs oullin_grafana_local # Grafana (local)
+docker logs oullin_prometheus_local # Prometheus (local)
+docker logs oullin_grafana # Grafana (production)
+docker logs oullin_prometheus # Prometheus (production)
+
+# Maintenance
+make monitor-backup # Backup Prometheus data
+make monitor-restart # Restart services (local)
+make monitor-restart-prod # Restart services (production)
+make monitor-export-dashboards
+
+# Cleanup
+make monitor-down # Stop services (local)
+make monitor-down-prod # Stop services (production)
+make monitor-clean # Clean up data (local)
+make monitor-clean-prod # Clean up data (production)
+```
+
+### Production Deployment
+
+For complete VPS deployment instructions including firewall setup, SSL configuration, and production best practices, see [VPS_DEPLOYMENT.md](./VPS_DEPLOYMENT.md).
+
+---
+
+## Next Steps
+
+1. **Set up Alerting**: Configure Prometheus Alertmanager for critical metrics
+2. **Add Custom Metrics**: Instrument your API with custom business metrics
+3. **Create Custom Dashboards**: Build dashboards specific to your use case
+4. **Configure Recording Rules**: Pre-compute expensive queries
+5. **Implement SLOs**: Define and monitor Service Level Objectives
+6. **Export and Share**: Share dashboard configurations with your team
+
+---
+
+For questions or issues, please check the [Troubleshooting](#troubleshooting) section or refer to the official documentation links above.
diff --git a/infra/metrics/VPS_DEPLOYMENT.md b/infra/metrics/VPS_DEPLOYMENT.md
new file mode 100644
index 00000000..11ca467d
--- /dev/null
+++ b/infra/metrics/VPS_DEPLOYMENT.md
@@ -0,0 +1,436 @@
+# VPS Deployment Guide
+
+Complete guide for deploying the Oullin monitoring stack on an Ubuntu VPS (Hostinger or similar).
+
+## Table of Contents
+
+1. [Prerequisites](#prerequisites)
+2. [Initial Server Setup](#initial-server-setup)
+3. [Install Docker and Docker Compose](#install-docker-and-docker-compose)
+4. [Install Make](#install-make)
+5. [Clone Your Repository](#clone-your-repository)
+6. [Configure Environment Variables](#configure-environment-variables)
+7. [Set Up Docker Secrets](#set-up-docker-secrets)
+8. [Configure Firewall](#configure-firewall)
+9. [Deploy the Monitoring Stack](#deploy-the-monitoring-stack)
+10. [Verify Monitoring Stack](#verify-monitoring-stack)
+11. [Access Grafana Remotely](#access-grafana-remotely)
+12. [Production Considerations](#production-considerations)
+13. [Generate Test Traffic](#generate-test-traffic)
+14. [VPS Troubleshooting](#vps-troubleshooting)
+15. [Updating the Stack](#updating-the-stack)
+16. [Installing Fail2ban](#installing-fail2ban)
+
+---
+
+## Prerequisites
+
+- Hostinger VPS with Ubuntu 20.04 or 22.04 (or similar VPS provider)
+- SSH access to your VPS
+- Domain name (optional, but recommended for SSL)
+- At least 2GB RAM and 20GB storage
+
+---
+
+## Initial Server Setup
+
+Connect to your VPS:
+
+```bash
+ssh root@your-vps-ip
+```
+
+Update the system:
+
+```bash
+apt update && apt upgrade -y
+```
+
+Create a non-root user:
+
+```bash
+# Create user
+adduser deployer
+
+# Add to sudo group
+usermod -aG sudo deployer
+
+# Switch to new user
+su - deployer
+```
+
+---
+
+## Install Docker and Docker Compose
+
+Install required packages:
+
+```bash
+sudo apt install -y apt-transport-https ca-certificates curl software-properties-common
+```
+
+Add Docker's official GPG key:
+
+```bash
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
+```
+
+Add Docker repository:
+
+```bash
+echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+```
+
+Install Docker:
+
+```bash
+sudo apt update
+sudo apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
+```
+
+Add your user to the docker group:
+
+```bash
+sudo usermod -aG docker ${USER}
+```
+
+Log out and back in, then verify:
+
+```bash
+docker --version
+docker compose version
+```
+
+---
+
+## Install Make
+
+```bash
+sudo apt install -y make
+```
+
+---
+
+## Clone Your Repository
+
+```bash
+cd ~
+git clone https://github.com/yourusername/your-repo.git
+cd your-repo
+```
+
+---
+
+## Configure Environment Variables
+
+Create your `.env` file with production settings:
+
+```bash
+cat > .env << 'EOF'
+# Database Configuration
+POSTGRES_USER=your_db_user
+POSTGRES_PASSWORD=your_strong_db_password
+POSTGRES_DB=your_database_name
+
+# Grafana Configuration (REQUIRED - no default)
+GRAFANA_ADMIN_PASSWORD=your_very_strong_grafana_password
+
+# Production Domain (optional, for SSL)
+DOMAIN=your-domain.com
+
+# Environment
+ENVIRONMENT=production
+EOF
+```
+
+**Security Notes:**
+- Use strong, unique passwords
+- Never commit `.env` to version control
+- Consider using a password manager
+
+---
+
+## Set Up Docker Secrets
+
+Avoid piping credentials through `echo` because the literal values end up in your shell history. Use one of the safer patterns below.
+
+### Option 1: Read secrets from secure input
+
+```bash
+# Prompt won't echo characters and won't touch shell history
+read -s -p "Enter database password: " DB_PASSWORD && echo
+
+# Use printf rather than echo so no trailing newline is appended to the secret
+printf "%s" "$DB_PASSWORD" | docker secret create pg_password - 2>/dev/null || \
+    printf "%s" "$DB_PASSWORD" > secrets/pg_password
+
+unset DB_PASSWORD
+```
+
+Repeat the same pattern for usernames or other sensitive values you do not want stored on disk.
+
+### Option 2: Write files directly
+
+```bash
+mkdir -p secrets
+printf "your_db_user" > secrets/pg_username
+printf "your_strong_db_password" > secrets/pg_password
+printf "your_database_name" > secrets/pg_dbname
+chmod 600 secrets/*
+```
+
+Store these files somewhere secure (e.g., `pass`, `1Password CLI`, `sops`) and only copy them onto the server when needed.
+
+---
+
+## Configure Firewall
+
+Set up UFW:
+
+```bash
+# Allow SSH (IMPORTANT: do this before enabling UFW, or you may lock yourself out!)
+sudo ufw allow 22/tcp
+
+# Allow HTTP and HTTPS (for Caddy)
+sudo ufw allow 80/tcp
+sudo ufw allow 443/tcp
+
+# Enable UFW (only after the allow rules above are in place)
+sudo ufw --force enable
+
+# Verify rules
+sudo ufw status
+```
+
+**Do NOT expose Prometheus (9090), Grafana (3000), or postgres_exporter (9187) ports!**
+
+---
+
+## Deploy the Monitoring Stack
+
+```bash
+# Start with production profile
+make monitor-up-prod
+# Or: docker compose --profile prod up -d
+```
+
+Verify services:
+
+```bash
+docker compose ps
+```
+
+Expected containers:
+- `oullin_prometheus`
+- `oullin_grafana`
+- `oullin_postgres_exporter`
+- `oullin_proxy_prod`
+- `oullin_db`
+
+---
+
+## Verify Monitoring Stack
+
+Check Prometheus targets:
+
+```bash
+curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
+```
+
+All should show `"health": "up"`.
+
+---
+
+## Access Grafana Remotely
+
+From your local machine:
+
+```bash
+ssh -L 3000:localhost:3000 deployer@your-vps-ip
+```
+
+Then open `http://localhost:3000` in your browser.
+
+**Login:**
+- Username: `admin`
+- Password: Value from `GRAFANA_ADMIN_PASSWORD`
+
+---
+
+## Production Considerations
+
+### Enable Automatic Backups
+
+Schedule daily backups:
+
+```bash
+crontab -e
+```
+
+Add:
+
+**Note:** Update `/home/deployer/your-repo` to your actual repository path.
+```cron
+# Run daily at 2 AM
+0 2 * * * cd /home/deployer/your-repo && make monitor-backup-prod >> /var/log/prometheus-backup.log 2>&1
+```
+
+### Monitor Disk Space
+
+```bash
+# Check disk usage
+df -h
+
+# Check Prometheus data size
+docker exec oullin_prometheus du -sh /prometheus
+```
+
+### Configure Log Rotation
+
+```bash
+sudo tee /etc/docker/daemon.json > /dev/null << 'EOF'
+{
+ "log-driver": "json-file",
+ "log-opts": {
+ "max-size": "10m",
+ "max-file": "3"
+ }
+}
+EOF
+
+sudo systemctl restart docker
+make monitor-restart-prod
+```
+
+### Enable SSL/TLS (Optional)
+
+If you have a domain, configure Caddy for automatic HTTPS.
+
+Edit `infra/caddy/Caddyfile.prod`:
+
+```caddyfile
+your-domain.com {
+ reverse_proxy api:8080
+
+ log {
+ output file /var/log/caddy/access.log
+ }
+}
+
+# Admin API (internal only)
+127.0.0.1:2019 {
+ admin {
+ metrics
+ }
+}
+```
+
+Caddy will automatically obtain Let's Encrypt certificates.
+
+---
+
+## Generate Test Traffic
+
+```bash
+make monitor-traffic-prod
+```
+
+Wait a few minutes for data to appear in Grafana.
+
+---
+
+## VPS Troubleshooting
+
+### Services won't start
+
+```bash
+# View logs from monitoring services
+make monitor-logs # Local: all services
+make monitor-logs-prod # Production: all services
+
+# Or view individual container logs
+docker logs oullin_grafana
+docker logs oullin_prometheus
+
+# Check Docker daemon
+sudo systemctl status docker
+```
+
+### Can't connect via SSH tunnel
+
+```bash
+# Verify Grafana is listening
+docker exec oullin_grafana netstat -tlnp | grep 3000
+
+# Check if port is already in use locally
+lsof -i :3000
+```
+
+### Prometheus targets are down
+
+```bash
+# Check DNS resolution
+docker exec oullin_prometheus nslookup oullin_proxy_prod
+docker exec oullin_prometheus nslookup oullin_postgres_exporter
+
+# Verify network
+docker network inspect caddy_net oullin_net
+```
+
+### Out of disk space
+
+```bash
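+# Clean up Docker
+# WARNING: --volumes also prunes unused volumes; keep the stack running so its
+# data volumes stay in use, or omit --volumes to be safe.
+docker system prune -a --volumes
+
+# Rotate backups (keeps last 5)
+make monitor-backup-prod
+
+# Clear old Prometheus data
+# WARNING: deleting the WAL discards recent samples not yet compacted into blocks.
+docker exec oullin_prometheus rm -rf /prometheus/wal/*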
+```
+
+---
+
+## Updating the Stack
+
+```bash
+cd ~/your-repo
+git pull origin main
+
+make monitor-down-prod
+make monitor-up-prod
+```
+
+---
+
+## Installing Fail2ban
+
+```bash
+sudo apt install -y fail2ban
+sudo systemctl start fail2ban
+sudo systemctl enable fail2ban
+sudo fail2ban-client status sshd
+```
+
+---
+
+## Production Checklist
+
+- ✅ `GRAFANA_ADMIN_PASSWORD` set in `.env`
+- ✅ Firewall configured (UFW)
+- ✅ Services bound to localhost
+- ✅ SSH tunneling configured
+- ✅ Backups scheduled (cron)
+- ✅ Log rotation configured
+- ✅ SSL/TLS enabled (if domain)
+- ✅ Fail2ban installed
+- ✅ All Prometheus targets UP
+- ✅ Dashboards accessible
+- ✅ Retention policies set
+- ✅ Volumes backed up regularly
+
+---
+
+## Additional Resources
+
+For monitoring-specific documentation, see [README.md](./README.md).
diff --git a/infra/metrics/grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json b/infra/metrics/grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json
new file mode 100644
index 00000000..47c068c4
--- /dev/null
+++ b/infra/metrics/grafana/dashboards/oullin-caddy-caddy-proxy-metrics.json
@@ -0,0 +1,482 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "reqps"
+ }
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 1,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "sum(rate(caddy_http_request_duration_seconds_count[5m]))",
+ "legendFormat": "Requests/s",
+ "refId": "A"
+ }
+ ],
+ "title": "Total Request Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "s"
+ }
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "id": 3,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["mean"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m]))",
+ "legendFormat": "p95",
+ "refId": "A"
+ }
+ ],
+ "title": "Response Time (p95)",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "normal"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "reqps"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "id": 4,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "sum by(code) (rate(caddy_http_request_duration_seconds_count[5m]))",
+ "legendFormat": "{{code}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Requests by Status Code",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "s"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "id": 5,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.50, rate(caddy_http_request_duration_seconds_bucket[5m]))",
+ "legendFormat": "p50",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m]))",
+ "legendFormat": "p95",
+ "refId": "B"
+ },
+ {
+ "expr": "histogram_quantile(0.99, rate(caddy_http_request_duration_seconds_bucket[5m]))",
+ "legendFormat": "p99",
+ "refId": "C"
+ }
+ ],
+ "title": "Request Duration Percentiles",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 14
+ },
+ "id": 6,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "rate(caddy_http_response_size_bytes_sum[5m])",
+ "legendFormat": "Response",
+ "refId": "A"
+ }
+ ],
+ "title": "Response Traffic Rate",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 14
+ },
+ "id": 7,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "sum(rate(caddy_http_request_errors_total[5m])) or vector(0)",
+ "legendFormat": "Errors/s",
+ "refId": "A"
+ }
+ ],
+ "title": "Request Errors",
+ "type": "timeseries"
+ }
+ ],
+ "schemaVersion": 39,
+ "tags": ["oullin", "caddy", "proxy"],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "Caddy - Proxy Metrics",
+ "uid": "oullin-caddy",
+ "version": 1
+}
diff --git a/infra/metrics/grafana/dashboards/oullin-overview-oullin-overview.json b/infra/metrics/grafana/dashboards/oullin-overview-oullin-overview.json
new file mode 100644
index 00000000..1a2e4d5e
--- /dev/null
+++ b/infra/metrics/grafana/dashboards/oullin-overview-oullin-overview.json
@@ -0,0 +1,395 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 1,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "sum(rate(caddy_http_request_duration_seconds_count[5m]))",
+ "legendFormat": "Caddy Requests/s",
+ "refId": "A"
+ }
+ ],
+ "title": "Caddy Request Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 50
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "pg_stat_database_numbackends{datname=~\".*\"}",
+ "legendFormat": "DB Connections",
+ "refId": "A"
+ }
+ ],
+ "title": "PostgreSQL Active Connections",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "reqps"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 8
+ },
+ "id": 3,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "sum by(code) (rate(caddy_http_request_duration_seconds_count[5m]))",
+ "legendFormat": "{{code}}",
+ "refId": "A"
+ }
+ ],
+ "title": "HTTP Requests by Status Code",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 16
+ },
+ "id": 4,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "go_memstats_alloc_bytes{job=\"api\"}",
+ "legendFormat": "API Memory Usage",
+ "refId": "A"
+ }
+ ],
+ "title": "API Memory Usage",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 16
+ },
+ "id": 5,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "go_goroutines{job=\"api\"}",
+ "legendFormat": "Goroutines",
+ "refId": "A"
+ }
+ ],
+ "title": "API Goroutines",
+ "type": "timeseries"
+ }
+ ],
+ "schemaVersion": 39,
+ "tags": ["oullin", "overview"],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "Oullin - Overview",
+ "uid": "oullin-overview",
+ "version": 1
+}
diff --git a/infra/metrics/grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json b/infra/metrics/grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json
new file mode 100644
index 00000000..abfc3662
--- /dev/null
+++ b/infra/metrics/grafana/dashboards/oullin-postgresql-postgresql-database-metrics.json
@@ -0,0 +1,600 @@
+{
+ "annotations": {
+ "list": []
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "yellow",
+ "value": 50
+ },
+ {
+ "color": "red",
+ "value": 100
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 0,
+ "y": 0
+ },
+ "id": 1,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "pg_stat_database_numbackends",
+ "legendFormat": "{{datname}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Active Connections",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ }
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 6,
+ "y": 0
+ },
+ "id": 2,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "pg_database_size_bytes",
+ "legendFormat": "{{datname}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Database Size",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 12,
+ "y": 0
+ },
+ "id": 3,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "rate(pg_stat_database_xact_commit[5m])",
+ "legendFormat": "Commits/s",
+ "refId": "A"
+ }
+ ],
+ "title": "Transaction Rate",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 6,
+ "x": 18,
+ "y": 0
+ },
+ "id": 4,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": ["lastNotNull"],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "auto"
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "pg_stat_database_conflicts",
+ "legendFormat": "Conflicts",
+ "refId": "A"
+ }
+ ],
+ "title": "Conflicts",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "id": 5,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "rate(pg_stat_database_tup_inserted[5m])",
+ "legendFormat": "Inserts - {{datname}}",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(pg_stat_database_tup_updated[5m])",
+ "legendFormat": "Updates - {{datname}}",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(pg_stat_database_tup_deleted[5m])",
+ "legendFormat": "Deletes - {{datname}}",
+ "refId": "C"
+ }
+ ],
+ "title": "Database Operations",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "id": 6,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "pg_stat_database_numbackends",
+ "legendFormat": "{{datname}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Active Connections Over Time",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "percentunit"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 14
+ },
+ "id": 7,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "rate(pg_stat_database_blks_hit[5m]) / (rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m]))",
+ "legendFormat": "{{datname}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Cache Hit Ratio",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "tooltip": false,
+ "viz": false,
+ "legend": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ }
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 14
+ },
+ "id": 8,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "11.4.0",
+ "targets": [
+ {
+ "expr": "pg_locks_count",
+ "legendFormat": "{{mode}} - {{datname}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Database Locks",
+ "type": "timeseries"
+ }
+ ],
+ "schemaVersion": 39,
+ "tags": ["oullin", "postgresql", "database"],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "PostgreSQL - Database Metrics",
+ "uid": "oullin-postgresql",
+ "version": 1
+}
diff --git a/infra/metrics/grafana/provisioning/dashboards/default.yml b/infra/metrics/grafana/provisioning/dashboards/default.yml
new file mode 100644
index 00000000..45fb2660
--- /dev/null
+++ b/infra/metrics/grafana/provisioning/dashboards/default.yml
@@ -0,0 +1,13 @@
+apiVersion: 1
+
+providers:
+ - name: 'Oullin Dashboards'
+ orgId: 1
+ folder: ''
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 10
+ allowUiUpdates: true
+ options:
+ path: /var/lib/grafana/dashboards
+ foldersFromFilesStructure: true
diff --git a/infra/metrics/grafana/provisioning/datasources/prometheus.yml b/infra/metrics/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 00000000..c9be740e
--- /dev/null
+++ b/infra/metrics/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,14 @@
+apiVersion: 1
+
+datasources:
+ - name: Prometheus
+ uid: prometheus
+ type: prometheus
+ access: proxy
+ url: ${GF_DATASOURCE_PROMETHEUS_URL}
+ isDefault: true
+ editable: true
+ allowUiUpdates: true
+ jsonData:
+ timeInterval: 15s
+ queryTimeout: 60s
diff --git a/infra/metrics/grafana/scripts/export-dashboards.sh b/infra/metrics/grafana/scripts/export-dashboards.sh
new file mode 100755
index 00000000..43e53a28
--- /dev/null
+++ b/infra/metrics/grafana/scripts/export-dashboards.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# Helper script to export Grafana dashboards
+
+set -e
+
+GRAFANA_URL="${GRAFANA_URL:-http://localhost:3000}"
+GRAFANA_USER="${GRAFANA_USER:-admin}"
+GRAFANA_PASSWORD="${GRAFANA_PASSWORD:-admin}"
+OUTPUT_DIR="./infra/metrics/grafana/dashboards"
+
+echo "================================"
+echo "Grafana Dashboard Export Tool"
+echo "================================"
+echo ""
+
+# Check if Grafana is running
+if ! curl -s "$GRAFANA_URL/api/health" > /dev/null 2>&1; then
+ echo "Error: Grafana is not accessible at $GRAFANA_URL"
+ echo "Please start Grafana with: make monitor-up"
+ exit 1
+fi
+
+# List all dashboards
+echo "Fetching dashboard list..."
+DASHBOARDS=$(curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
+ "$GRAFANA_URL/api/search?type=dash-db" | jq -r '.[] | "\(.uid) \(.title)"')
+
+if [ -z "$DASHBOARDS" ]; then
+ echo "No dashboards found in Grafana"
+ exit 0
+fi
+
+echo ""
+echo "Available dashboards:"
+echo "---------------------"
+echo "$DASHBOARDS" | nl
+echo ""
+
+# Ask user which dashboard to export
+read -r -p "Enter dashboard number to export (or 'all' for all dashboards): " SELECTION
+
+# Validate selection
+if [ "$SELECTION" != "all" ]; then
+ # Check if selection is a valid number
+ if ! [[ "$SELECTION" =~ ^[0-9]+$ ]]; then
+ echo "Error: Please enter a valid number or 'all'"
+ exit 1
+ fi
+
+ # Check if selection is within valid range
+ DASHBOARD_COUNT=$(echo "$DASHBOARDS" | wc -l)
+ if [ "$SELECTION" -lt 1 ] || [ "$SELECTION" -gt "$DASHBOARD_COUNT" ]; then
+ echo "Error: Selection out of range (1-$DASHBOARD_COUNT)"
+ exit 1
+ fi
+fi
+
+if [ "$SELECTION" = "all" ]; then
+ # Export all dashboards
+ echo ""
+ echo "Exporting all dashboards..."
+
+ EXPORT_COUNT=0
+ FAIL_COUNT=0
+
+ while IFS= read -r line; do
+ UID=$(echo "$line" | awk '{print $1}')
+ TITLE=$(echo "$line" | cut -d' ' -f2-)
+ FILENAME="${UID}-$(echo "$TITLE" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd '[:alnum:]-').json"
+
+ echo -n "Exporting: $TITLE -> $FILENAME ... "
+
+ # Temporarily disable errexit for this operation
+ set +e
+ if curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
+ "$GRAFANA_URL/api/dashboards/uid/$UID" | \
+ jq 'del(.meta) | .dashboard.id = null | .overwrite = true' > \
+ "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+
+ # Verify the file is valid JSON and not empty
+ if [ -s "$OUTPUT_DIR/$FILENAME" ] && jq empty "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+ echo "✓ Success"
+ ((EXPORT_COUNT++))
+ else
+ echo "✗ Failed (invalid JSON)"
+ rm -f "$OUTPUT_DIR/$FILENAME"
+ ((FAIL_COUNT++))
+ fi
+ else
+ echo "✗ Failed (export error)"
+ rm -f "$OUTPUT_DIR/$FILENAME"
+ ((FAIL_COUNT++))
+ fi
+ set -e
+ done <<< "$DASHBOARDS"
+
+ echo ""
+ echo "Export summary: $EXPORT_COUNT succeeded, $FAIL_COUNT failed"
+
+ if [ $FAIL_COUNT -gt 0 ]; then
+ exit 1
+ fi
+
+else
+ # Export single dashboard
+ SELECTED_LINE=$(echo "$DASHBOARDS" | sed -n "${SELECTION}p")
+
+ if [ -z "$SELECTED_LINE" ]; then
+ echo "Error: Invalid selection"
+ exit 1
+ fi
+
+ UID=$(echo "$SELECTED_LINE" | awk '{print $1}')
+ TITLE=$(echo "$SELECTED_LINE" | cut -d' ' -f2-)
+ FILENAME="${UID}-$(echo "$TITLE" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd '[:alnum:]-').json"
+
+ echo ""
+ echo "Exporting: $TITLE"
+
+ # Temporarily disable errexit for this operation
+ set +e
+ if curl -s -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
+ "$GRAFANA_URL/api/dashboards/uid/$UID" | \
+ jq 'del(.meta) | .dashboard.id = null | .overwrite = true' > \
+ "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+
+ # Verify the file is valid JSON and not empty
+ if [ -s "$OUTPUT_DIR/$FILENAME" ] && jq empty "$OUTPUT_DIR/$FILENAME" 2>/dev/null; then
+ echo "✓ Saved to: $OUTPUT_DIR/$FILENAME"
+ else
+ echo "✗ Error: Export produced invalid JSON"
+ rm -f "$OUTPUT_DIR/$FILENAME"
+ exit 1
+ fi
+ else
+ echo "✗ Error: Failed to export dashboard"
+ rm -f "$OUTPUT_DIR/$FILENAME"
+ exit 1
+ fi
+ set -e
+fi
+
+echo ""
+echo "Export complete!"
+echo ""
+echo "To reload dashboards:"
+echo " make monitor-restart"
diff --git a/infra/metrics/prometheus/provisioning/prometheus.local.yml b/infra/metrics/prometheus/provisioning/prometheus.local.yml
new file mode 100644
index 00000000..4c661cbb
--- /dev/null
+++ b/infra/metrics/prometheus/provisioning/prometheus.local.yml
@@ -0,0 +1,41 @@
+# Prometheus configuration for local development/testing
+global:
+ scrape_interval: 15s
+ evaluation_interval: 15s
+ external_labels:
+ monitor: 'oullin-local'
+ environment: 'local'
+
+scrape_configs:
+ # Caddy metrics endpoint (dedicated /metrics endpoint, not admin API)
+ - job_name: 'caddy'
+ static_configs:
+ - targets: ['caddy_local:9180']
+ labels:
+ service: 'caddy'
+ environment: 'local'
+
+ # PostgreSQL database metrics via postgres_exporter (local)
+ - job_name: 'postgresql'
+ static_configs:
+ - targets: ['oullin_postgres_exporter_local:9187']
+ labels:
+ service: 'postgresql'
+ environment: 'local'
+
+ # API metrics endpoint (local)
+ - job_name: 'api'
+ metrics_path: '/metrics'
+ static_configs:
+ - targets: ['api:8080']
+ labels:
+ service: 'api'
+ environment: 'local'
+
+ # Prometheus self-monitoring
+ - job_name: 'prometheus'
+ static_configs:
+ - targets: ['localhost:9090']
+ labels:
+ service: 'prometheus'
+ environment: 'local'
diff --git a/infra/metrics/prometheus/provisioning/prometheus.yml b/infra/metrics/prometheus/provisioning/prometheus.yml
new file mode 100644
index 00000000..18ef3a2c
--- /dev/null
+++ b/infra/metrics/prometheus/provisioning/prometheus.yml
@@ -0,0 +1,41 @@
+# Prometheus configuration for monitoring Caddy, API, and PostgreSQL
+global:
+ scrape_interval: 15s
+ evaluation_interval: 15s
+ external_labels:
+ monitor: 'oullin-prod'
+ environment: 'production'
+
+scrape_configs:
+ # Caddy metrics endpoint (dedicated /metrics endpoint, not admin API)
+ - job_name: 'caddy'
+ static_configs:
+ - targets: ['caddy_prod:9180']
+ labels:
+ service: 'caddy'
+ environment: 'production'
+
+ # PostgreSQL database metrics via postgres_exporter
+ - job_name: 'postgresql'
+ static_configs:
+ - targets: ['oullin_postgres_exporter:9187']
+ labels:
+ service: 'postgresql'
+ environment: 'production'
+
+ # API metrics endpoint
+ - job_name: 'api'
+ metrics_path: '/metrics'
+ static_configs:
+ - targets: ['api:8080']
+ labels:
+ service: 'api'
+ environment: 'production'
+
+ # Prometheus self-monitoring
+ - job_name: 'prometheus'
+ static_configs:
+ - targets: ['localhost:9090']
+ labels:
+ service: 'prometheus'
+ environment: 'production'
diff --git a/infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh b/infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh
new file mode 100755
index 00000000..55f48fce
--- /dev/null
+++ b/infra/metrics/prometheus/scripts/postgres-exporter-entrypoint.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+set -e
+
+# Percent-encode every byte of $1 as %XX (od + tr, POSIX) for credentials with special characters (@, :, /, ?, =).
+# od needs -v: without it, a 16-byte line identical to the previous one is printed as "*", corrupting the output.
+urlencode() {
+    string="$1"
+    printf '%s' "$string" | od -An -v -tx1 | tr ' ' % | tr -d '\n'
+}
+
+# Read Docker secrets separately for better error diagnostics
+PG_USER=$(cat /run/secrets/pg_username)
+PG_PASSWORD=$(cat /run/secrets/pg_password)
+PG_DBNAME=$(cat /run/secrets/pg_dbname)
+
+# Construct DATA_SOURCE_NAME with URL-encoded credentials
+export DATA_SOURCE_NAME="postgresql://$(urlencode "$PG_USER"):$(urlencode "$PG_PASSWORD")@api-db:5432/$(urlencode "$PG_DBNAME")?sslmode=require"
+
+# Execute postgres_exporter with any additional arguments
+exec /bin/postgres_exporter "$@"
diff --git a/metal/kernel/app.go b/metal/kernel/app.go
index f4066948..c4f10fb4 100644
--- a/metal/kernel/app.go
+++ b/metal/kernel/app.go
@@ -87,6 +87,7 @@ func (a *App) Boot() {
modem.KeepAlive()
modem.KeepAliveDB()
+ modem.Metrics()
modem.Profile()
modem.Experience()
modem.Projects()
diff --git a/metal/router/router.go b/metal/router/router.go
index 0c68015b..02dab599 100644
--- a/metal/router/router.go
+++ b/metal/router/router.go
@@ -92,6 +92,17 @@ func (r *Router) KeepAliveDB() {
r.Mux.HandleFunc("GET /ping-db", apiHandler)
}
+// Metrics registers the GET /metrics route, served by the handler built by handler.NewMetricsHandler.
+//
+// Public access is expected to be blocked by Caddy (see the @protected matcher in the Caddyfile);
+// Prometheus scrapes the endpoint over internal DNS at api:8080/metrics, bypassing Caddy's public listener.
+func (r *Router) Metrics() {
+	exporter := handler.NewMetricsHandler()
+
+	serve := func(w http.ResponseWriter, req *http.Request) {
+		// NOTE(review): Handle's return value is discarded — confirm the handler reports failures to the client itself.
+		_ = exporter.Handle(w, req)
+	}
+
+	r.Mux.HandleFunc("GET /metrics", serve)
+}
+
func (r *Router) Profile() {
maker := handler.NewProfileHandler
diff --git a/storage/monitoring/backups/.gitkeep b/storage/monitoring/backups/.gitkeep
new file mode 100644
index 00000000..5aab5f49
--- /dev/null
+++ b/storage/monitoring/backups/.gitkeep
@@ -0,0 +1 @@
+# Prometheus backups stored here