Improve production deployment
medihack committed Jun 11, 2024
1 parent c98e156 commit 485939f
Showing 3 changed files with 60 additions and 42 deletions.
10 changes: 8 additions & 2 deletions compose/docker-compose.base.yml
@@ -23,15 +23,21 @@ x-app: &default-app
    FLOWER_PORT: "5555"

services:
  init:
    <<: *default-app
    hostname: init.local
    volumes:
      - radis_data:/var/www/radis
      - /mnt:/mnt

  web:
    <<: *default-app
    hostname: web.local
    build:
      context: ..
    volumes:
      # Cave, overwrites the above anchor
      - /mnt:/mnt
      - radis_data:/var/www/radis
      - /mnt:/mnt

  worker_default:
    <<: *default-app
4 changes: 2 additions & 2 deletions compose/docker-compose.dev.yml
@@ -13,10 +13,10 @@ x-llamacpp: &llamacpp
  env_file:
    - ../.env.dev
  hostname: llamacpp.local
  volumes:
    - models_data:/models
  ports:
    - 9610:8080
  volumes:
    - models_data:/models
  entrypoint: "/bin/bash -c '/server -mu $${LLM_MODEL_URL} -c 512 --host 0.0.0.0 --port 8080'"

services:
88 changes: 50 additions & 38 deletions compose/docker-compose.prod.yml
@@ -1,5 +1,4 @@
x-app: &default-app
  restart: always
  image: radis_prod:latest
  env_file:
    - ../.env.prod
@@ -9,89 +8,107 @@ x-app: &default-app
    SSL_CERT_FILE: "/var/www/radis/ssl/cert.pem"
    SSL_KEY_FILE: "/var/www/radis/ssl/key.pem"

x-deploy: &deploy
  replicas: 1
  restart_policy:
    condition: on-failure
    max_attempts: 3

services:
  web:
  # We can't run these manage commands inside the web containers in production because
  # the web service may have multiple replicas. So we run them exactly once in this init
  # service and have the web service containers wait for it to finish.
  init:
    <<: *default-app
    build:
      target: production
    ports:
      - "${RADIS_HTTP_PORT:-80}:80"
      - "${RADIS_HTTPS_PORT:-443}:443"
    hostname: init.local
    command: >
      bash -c "
        wait-for-it -s postgres.local:5432 -t 60 &&
        wait-for-it -s postgres.local:5432 -t 120 &&
        ./manage.py migrate &&
        ./manage.py collectstatic --no-input &&
        ./manage.py create_admin &&
        ./manage.py generate_cert &&
        wait-for-it -s opensearch-node1.local:9200 -t 60 &&
        ./manage.py setup_opensearch &&
        ./manage.py opensearch --mappings prod &&
        # wait-for-it -s vespa.local:19071 -t 60 &&
        # ./manage.py vespa --generate --deploy &&
        ./manage.py ok_server --host 0.0.0.0 --port 8000
      "
    deploy:
      <<: *deploy

  web:
    <<: *default-app
    build:
      target: production
    ports:
      - "${RADIS_HTTP_PORT:-80}:80"
      - "${RADIS_HTTPS_PORT:-443}:443"
    command: >
      bash -c "
        wait-for-it -s init.local:8000 -t 300 &&
        echo 'Starting web server ...'
        daphne -b 0.0.0.0 -p 80 -e ssl:443:privateKey=/var/www/radis/ssl/key.pem:certKey=/var/www/radis/ssl/cert.pem radis.asgi:application
      "
    deploy:
      replicas: 1
      <<: *deploy
      replicas: 3

  worker_default:
    <<: *default-app
    command: ./manage.py celery_worker -Q default_queue
    deploy:
      replicas: 1
      <<: *deploy

  worker_vespa:
    <<: *default-app
    command: ./manage.py celery_worker -c 1 -Q vespa_queue
    deploy:
      replicas: 0
      <<: *deploy

  worker_llm:
    <<: *default-app
    command: ./manage.py celery_worker -c 1 -Q llm_queue
    deploy:
      replicas: 1
      <<: *deploy

  celery_beat:
    <<: *default-app
    command: ./manage.py celery_beat
    deploy:
      replicas: 1
      <<: *deploy

  flower:
    <<: *default-app
    deploy:
      replicas: 1
      <<: *deploy

  llamacpp-gpu:
    restart: always
    image: ghcr.io/ggerganov/llama.cpp:server-cuda
    environment:
      MODEL_URL: https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.Q5_K_M.gguf
    hostname: llamacpp.local
    env_file:
      - ../.env.prod
    ports:
      - 9610:8080
    volumes:
      - models_data:/models
    command: "-mu $${MODEL_URL} -m /models/model.gguf -cb -c 2048 --host 0.0.0.0 --port 8080"
    entrypoint: "/bin/bash -c '/server -mu $${LLM_MODEL_URL} -cb -c 2048 --host 0.0.0.0 --port 8080'"
    deploy:
      replicas: 1
      # <<: *deploy
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
          generic_resources:
            - discrete_resource_spec:
                kind: "gpu"
                value: 1

  postgres:
    restart: always
    env_file:
      - ../.env.prod
    deploy:
      replicas: 1
      <<: *deploy

  opensearch-node1:
    restart: always
    image: opensearchproject/opensearch:2
    hostname: opensearch-node1.local
    environment:
@@ -116,10 +133,9 @@ services:
      - 9200:9200 # REST API
      - 9600:9600 # Performance Analyzer
    deploy:
      replicas: 1
      <<: *deploy

  opensearch-node2:
    restart: always
    image: opensearchproject/opensearch:2
    hostname: opensearch-node2.local
    environment:
@@ -141,10 +157,9 @@ services:
    volumes:
      - opensearch-data2:/usr/share/opensearch/data
    deploy:
      replicas: 1
      <<: *deploy

  opensearch-node3:
    restart: always
    image: opensearchproject/opensearch:2
    hostname: opensearch-node3.local
    environment:
@@ -166,7 +181,7 @@ services:
    volumes:
      - opensearch-data2:/usr/share/opensearch/data
    deploy:
      replicas: 1
      <<: *deploy

  opensearch-dashboards:
    image: opensearchproject/opensearch-dashboards:2
@@ -175,10 +190,9 @@ services:
    environment:
      OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200","https://opensearch-node3:9200"]'
    deploy:
      replicas: 1
      <<: *deploy

  vespa:
    restart: always
    image: vespaengine/vespa:8
    hostname: vespa.local
    healthcheck:
@@ -195,14 +209,12 @@ services:
      replicas: 0

  rabbit:
    restart: always
    deploy:
      replicas: 1
      <<: *deploy

  redis:
    restart: always
    deploy:
      replicas: 1
      <<: *deploy

volumes:
  models_data:

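A note on the init pattern introduced in docker-compose.prod.yml above: because the web service can now run with multiple replicas, the one-off manage commands (migrations, collectstatic, admin and certificate creation, OpenSearch setup) are moved into a single-replica init service. Once those commands finish, init serves a trivial OK endpoint, and every web replica waits for that endpoint before starting daphne. Below is a condensed sketch of that pattern, not part of the commit; it only reuses names that appear in the diff (radis_prod, init.local, ok_server, wait-for-it, daphne, radis.asgi) and omits everything else.

# Condensed illustration of the init-gating pattern (sketch, not part of the commit).
services:
  init:
    image: radis_prod:latest
    hostname: init.local
    command: >
      bash -c "
        ./manage.py migrate &&
        ./manage.py ok_server --host 0.0.0.0 --port 8000
      "
    deploy:
      replicas: 1              # must stay at one replica so setup runs exactly once

  web:
    image: radis_prod:latest
    command: >
      bash -c "
        wait-for-it -s init.local:8000 -t 300 &&
        daphne -b 0.0.0.0 -p 80 radis.asgi:application
      "
    deploy:
      replicas: 3              # safe to scale out; setup has already completed

The commit also factors common deploy settings into the x-deploy anchor; a service that needs a different replica count, such as web, merges the anchor with <<: *deploy and then sets replicas explicitly, and an explicitly listed key takes precedence over one pulled in by the merge.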