raw vllm backend (#75)
lucasavila00 committed Apr 1, 2024
1 parent ac2567b commit dbcf3c7
Showing 5 changed files with 24 additions and 2 deletions.
4 changes: 3 additions & 1 deletion docker/runpod-serverless-vllm/README.md
@@ -20,5 +20,7 @@ It was built following Runpod's example:
https://github.com/runpod-workers/worker-vllm/tree/main?tab=readme-ov-file#example-building-an-image-with-openchat-35

```
-docker build -t degroote22/lmscript-runpod-serverless-vllm:0.0.2 --build-arg MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2" --build-arg WORKER_CUDA_VERSION=12.1.0 .
+docker build -t degroote22/lmscript-runpod-serverless-vllm:0.0.3 --build-arg MODEL_NAME="TheBloke/Mistral-7B-Instruct-v0.2-AWQ" --build-arg WORKER_CUDA_VERSION=12.1.0 .
+docker push degroote22/lmscript-runpod-serverless-vllm:0.0.3
```
1 change: 1 addition & 0 deletions docker/vllm/README.md
@@ -0,0 +1 @@
TODO
17 changes: 17 additions & 0 deletions docker/vllm/docker-compose.yml
@@ -0,0 +1,17 @@
services:
  sv:
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    image: vllm/vllm-openai:v0.4.0
    command: "--model TheBloke/Mistral-7B-Instruct-v0.2-AWQ --gpu-memory-utilization 0.8 --max-model-len 4096 --quantization awq"
    ports:
      - 8000:8000
    network_mode: host
    ipc: "host"
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface
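
A minimal local smoke test of this compose service might look like the sketch below. It assumes the container is reachable on localhost:8000 and uses vLLM's standard OpenAI-compatible routes (`/v1/models`, `/v1/completions`); the prompt and `max_tokens` values are placeholders.

```
# start the vLLM OpenAI-compatible server defined in docker-compose.yml
docker compose up -d

# list the models the server exposes (should include TheBloke/Mistral-7B-Instruct-v0.2-AWQ)
curl http://localhost:8000/v1/models

# request a short completion from the served AWQ model
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ", "prompt": "Hello", "max_tokens": 16}'
```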
2 changes: 2 additions & 0 deletions examples/client/README.md
@@ -9,3 +9,5 @@ Running all examples tests all features of the client.
- Executing the SGLang tests, local server

`npx tsx src/sg-runtime.ts`

`npx tsx src/vllm-runtime.ts`
2 changes: 1 addition & 1 deletion examples/client/src/vllm-runtime.ts
@@ -7,7 +7,7 @@ const bench = async () => {
  let completionTokens = 0;
  const backend = new VllmBackend({
    url: `http://localhost:8000`,
-   model: "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ",
+   model: "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
    reportUsage: ({ promptTokens: pt, completionTokens: ct }) => {
      promptTokens += pt;
      completionTokens += ct;