From dbcf3c789d17df99c8c25fb1694182b402a0900c Mon Sep 17 00:00:00 2001
From: Lucas de Ávila Martins
Date: Sun, 31 Mar 2024 23:58:40 -0300
Subject: [PATCH] raw vllm backend (#75)

---
 docker/runpod-serverless-vllm/README.md |  4 +++-
 docker/vllm/README.md                   |  1 +
 docker/vllm/docker-compose.yml          | 17 +++++++++++++++++
 examples/client/README.md               |  2 ++
 examples/client/src/vllm-runtime.ts     |  2 +-
 5 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 docker/vllm/README.md
 create mode 100644 docker/vllm/docker-compose.yml

diff --git a/docker/runpod-serverless-vllm/README.md b/docker/runpod-serverless-vllm/README.md
index 964c826..822cb58 100644
--- a/docker/runpod-serverless-vllm/README.md
+++ b/docker/runpod-serverless-vllm/README.md
@@ -20,5 +20,7 @@ It was built following Runpod's example:
 https://github.com/runpod-workers/worker-vllm/tree/main?tab=readme-ov-file#example-building-an-image-with-openchat-35
 
 ```
-docker build -t degroote22/lmscript-runpod-serverless-vllm:0.0.2 --build-arg MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2" --build-arg WORKER_CUDA_VERSION=12.1.0 .
+docker build -t degroote22/lmscript-runpod-serverless-vllm:0.0.3 --build-arg MODEL_NAME="TheBloke/Mistral-7B-Instruct-v0.2-AWQ" --build-arg WORKER_CUDA_VERSION=12.1.0 .
+
+docker push degroote22/lmscript-runpod-serverless-vllm:0.0.3
 ```
diff --git a/docker/vllm/README.md b/docker/vllm/README.md
new file mode 100644
index 0000000..1333ed7
--- /dev/null
+++ b/docker/vllm/README.md
@@ -0,0 +1 @@
+TODO
diff --git a/docker/vllm/docker-compose.yml b/docker/vllm/docker-compose.yml
new file mode 100644
index 0000000..3dd2898
--- /dev/null
+++ b/docker/vllm/docker-compose.yml
@@ -0,0 +1,17 @@
+services:
+  sv:
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    image: vllm/vllm-openai:v0.4.0
+    command: "--model TheBloke/Mistral-7B-Instruct-v0.2-AWQ --gpu-memory-utilization 0.8 --max-model-len 4096 --quantization awq"
+    ports:
+      - 8000:8000
+    network_mode: host
+    ipc: "host"
+    volumes:
+      - ~/.cache/huggingface:/root/.cache/huggingface
\ No newline at end of file
diff --git a/examples/client/README.md b/examples/client/README.md
index 9061d65..67622f1 100644
--- a/examples/client/README.md
+++ b/examples/client/README.md
@@ -9,3 +9,5 @@ Running all examples tests all features of the client.
 
 - Executing the SGLang tests, local server
   `npx tsx src/sg-runtime.ts`
+
+  `npx tsx src/vllm-runtime.ts`
diff --git a/examples/client/src/vllm-runtime.ts b/examples/client/src/vllm-runtime.ts
index f094461..70cdad3 100644
--- a/examples/client/src/vllm-runtime.ts
+++ b/examples/client/src/vllm-runtime.ts
@@ -7,7 +7,7 @@ const bench = async () => {
   let completionTokens = 0;
   const backend = new VllmBackend({
     url: `http://localhost:8000`,
-    model: "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ",
+    model: "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
     reportUsage: ({ promptTokens: pt, completionTokens: ct }) => {
       promptTokens += pt;
       completionTokens += ct;
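
Not part of the patch: a minimal smoke-test sketch for the compose service it adds, assuming the vLLM OpenAI-compatible server from docker/vllm/docker-compose.yml is up on localhost:8000 and serving the model named in the compose file. The file name `src/vllm-smoke.ts` and the prompt are illustrative and do not exist in the repository.

```
// src/vllm-smoke.ts (hypothetical) -- sanity-check the raw vLLM server started by
// docker/vllm/docker-compose.yml before running the client examples.
// Calls vLLM's OpenAI-compatible completions endpoint on port 8000.
const main = async () => {
  const res = await fetch("http://localhost:8000/v1/completions", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
      prompt: "[INST] Reply with a single word. [/INST]",
      max_tokens: 16,
    }),
  });
  if (!res.ok) throw new Error(`vLLM server returned ${res.status}`);
  const json = await res.json();
  console.log(json.choices?.[0]?.text);
};

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
```

If saved alongside the other examples, it would run the same way: `npx tsx src/vllm-smoke.ts`.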