
Commit cfcac3f

XinyaoWa and chensuyue authored
Fix vLLM and vLLM-on-Ray UT bug (#580)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
1 parent d68be05 commit cfcac3f

8 files changed: +37 −34 lines

ChatQnA/docker/gaudi/README.md

Lines changed: 6 additions & 6 deletions

@@ -173,9 +173,9 @@ export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 export LLM_MODEL_ID_NAME="neural-chat-7b-v3-3"
 export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090"
 export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
-export vLLM_LLM_ENDPOINT="http://${host_ip}:8008"
-export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8008"
+export TGI_LLM_ENDPOINT="http://${host_ip}:8005"
+export vLLM_LLM_ENDPOINT="http://${host_ip}:8007"
+export vLLM_RAY_LLM_ENDPOINT="http://${host_ip}:8006"
 export LLM_SERVICE_PORT=9000
 export REDIS_URL="redis://${host_ip}:6379"
 export INDEX_NAME="rag-redis"
@@ -296,15 +296,15 @@ curl http://${host_ip}:8000/v1/reranking \
 
 ```bash
 #TGI Service
-curl http://${host_ip}:8008/generate \
+curl http://${host_ip}:8005/generate \
   -X POST \
   -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
   -H 'Content-Type: application/json'
 ```
 
 ```bash
 #vLLM Service
-curl http://${host_ip}:8008/v1/completions \
+curl http://${host_ip}:8007/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
   "model": "${LLM_MODEL_ID}",
@@ -316,7 +316,7 @@ curl http://${host_ip}:8008/v1/completions \
 
 ```bash
 #vLLM-on-Ray Service
-curl http://${host_ip}:8008/v1/chat/completions \
+curl http://${host_ip}:8006/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
 ```
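
With this change the three backends no longer share port 8008: TGI publishes 8005, vLLM-on-Ray 8006, and vLLM 8007. A minimal sketch (not part of this commit) for checking which port is actually listening, assuming `${host_ip}` is exported as above and that only the backend started by the chosen compose file will answer:

```bash
# Probe the re-assigned LLM serving ports: TGI -> 8005, vLLM-on-Ray -> 8006, vLLM -> 8007.
# Uses bash's /dev/tcp so no extra tools are needed; only one port is expected to be open
# per deployment, depending on which compose file was used.
for port in 8005 8006 8007; do
  if (exec 3<>"/dev/tcp/${host_ip}/${port}") 2>/dev/null; then
    echo "port ${port}: open"
  else
    echo "port ${port}: closed"
  fi
done
```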

ChatQnA/docker/gaudi/compose.yaml

Lines changed: 1 addition & 1 deletion

@@ -114,7 +114,7 @@ services:
     image: ghcr.io/huggingface/tgi-gaudi:2.0.1
     container_name: tgi-gaudi-server
     ports:
-      - "8008:80"
+      - "8005:80"
     volumes:
       - "./data:/data"
     environment:
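
Only the host side of the mapping changes; TGI still serves on container port 80 inside the container. A quick sketch for confirming the published port, assuming the `tgi-gaudi-server` container from this compose file is running:

```bash
# Show the host port mapped to container port 80 for the TGI service.
docker port tgi-gaudi-server 80
# expected (assumption): 0.0.0.0:8005
```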

ChatQnA/docker/gaudi/compose_vllm.yaml

Lines changed: 3 additions & 3 deletions

@@ -112,7 +112,7 @@ services:
     image: opea/llm-vllm-hpu:latest
     container_name: vllm-gaudi-server
     ports:
-      - "8008:80"
+      - "8007:80"
     volumes:
       - "./data:/data"
     environment:
@@ -122,12 +122,12 @@ services:
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      LLM_MODEL: ${LLM_MODEL_ID}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
+    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
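
The service now passes the model through `LLM_MODEL_ID` in both the environment block and the serving command. A sketch for inspecting the resolved service definition before starting anything, assuming it is run from `ChatQnA/docker/gaudi` with the variable exported as in the README:

```bash
# Render the fully interpolated compose file without starting containers and
# verify the model id reaches both the environment and the vLLM command line.
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
docker compose -f compose_vllm.yaml config | grep -E "LLM_MODEL_ID|--model"
```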

ChatQnA/docker/gaudi/compose_vllm_ray.yaml

Lines changed: 3 additions & 3 deletions

@@ -112,7 +112,7 @@ services:
     image: opea/llm-vllm-ray-hpu:latest
     container_name: vllm-ray-gaudi-server
     ports:
-      - "8008:8000"
+      - "8006:8000"
     volumes:
       - "./data:/data"
     environment:
@@ -122,12 +122,12 @@ services:
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
-      LLM_MODEL: ${LLM_MODEL_ID}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL --tensor_parallel_size 2 --enforce_eager True"
+    command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True"
   llm:
     image: opea/llm-vllm-ray:latest
     container_name: llm-vllm-ray-gaudi-server

ChatQnA/tests/test_chatqna_on_gaudi.sh

Lines changed: 2 additions & 2 deletions

@@ -50,7 +50,7 @@ function start_services() {
     export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
     export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
     export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-    export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
+    export TGI_LLM_ENDPOINT="http://${ip_address}:8005"
     export REDIS_URL="redis://${ip_address}:6379"
     export REDIS_HOST=${ip_address}
     export INDEX_NAME="rag-redis"
@@ -215,7 +215,7 @@ function validate_microservices() {
 
     # tgi for llm service
     validate_service \
-        "${ip_address}:8008/generate" \
+        "${ip_address}:8005/generate" \
         "generated_text" \
         "tgi-llm" \
         "tgi-gaudi-server" \

ChatQnA/tests/_test_chatqna_vllm_on_gaudi.sh renamed to ChatQnA/tests/test_chatqna_vllm_on_gaudi.sh

Lines changed: 8 additions & 6 deletions

@@ -50,7 +50,8 @@ function start_services() {
     export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
     export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
     export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-    export vLLM_LLM_ENDPOINT="http://${ip_address}:8008"
+    export vLLM_LLM_ENDPOINT="http://${ip_address}:8007"
+    export LLM_SERVICE_PORT=9000
     export REDIS_URL="redis://${ip_address}:6379"
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
@@ -79,12 +80,13 @@ function start_services() {
     # Start Docker Containers
     docker compose -f compose_vllm.yaml up -d
     n=0
-    until [[ "$n" -ge 180 ]]; do
+    until [[ "$n" -ge 25 ]]; do
+        echo "n=$n"
         docker logs vllm-gaudi-server > vllm_service_start.log
-        if grep -q Connected vllm_service_start.log; then
+        if grep -q "Warmup finished" vllm_service_start.log; then
             break
         fi
-        sleep 1s
+        sleep 20s
         n=$((n+1))
     done
 }
@@ -165,7 +167,7 @@ function validate_microservices() {
 
     # vllm for llm service
     validate_services \
-        "${ip_address}:8008/v1/completions" \
+        "${ip_address}:8007/v1/completions" \
         "text" \
         "vllm-llm" \
         "vllm-gaudi-server" \
@@ -185,7 +187,7 @@ function validate_megaservice() {
     # Curl the Mega Service
     validate_services \
         "${ip_address}:8888/v1/chatqna" \
-        "billion" \
+        "data:" \
         "mega-chatqna" \
         "chatqna-gaudi-backend-server" \
         '{"messages": "What is the revenue of Nike in 2023?"}'

ChatQnA/tests/_test_chatqna_vllm_on_xeon.sh renamed to ChatQnA/tests/test_chatqna_vllm_on_xeon.sh

Lines changed: 8 additions & 8 deletions

@@ -26,16 +26,15 @@ function build_docker_images() {
     cd $WORKPATH/docker/ui
     docker build --no-cache -t opea/chatqna-ui:latest -f docker/Dockerfile .
 
-    # cd $WORKPATH
-    # git clone https://github.com/vllm-project/vllm.git
-    # cd vllm
-    # docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .
+    # cd $WORKPATH
+    # git clone https://github.com/vllm-project/vllm.git
+    # cd vllm
+    # docker build --no-cache -t opea/vllm:latest -f Dockerfile.cpu .
 
     docker images
 }
 
 function start_services() {
-    # build vllm for each test instead of pull from local registry
     cd $WORKPATH
     git clone https://github.com/vllm-project/vllm.git
     cd vllm
@@ -73,18 +72,19 @@ function start_services() {
         sed -i "s#image: opea/chatqna-ui:latest#image: opea/chatqna-ui:${IMAGE_TAG}#g" compose_vllm.yaml
         sed -i "s#image: opea/chatqna-conversation-ui:latest#image: opea/chatqna-conversation-ui:${IMAGE_TAG}#g" compose_vllm.yaml
         sed -i "s#image: opea/*#image: ${IMAGE_REPO}opea/#g" compose_vllm.yaml
+        sed -i "s#image: ${IMAGE_REPO}opea/vllm:latest#image: opea/vllm:latest#g" compose_vllm.yaml
         fi
     fi
 
     # Start Docker Containers
     docker compose -f compose_vllm.yaml up -d
     n=0
-    until [[ "$n" -ge 100 ]]; do
+    until [[ "$n" -ge 10 ]]; do
         docker logs vllm-service > ${LOG_PATH}/vllm_service_start.log
         if grep -q Connected ${LOG_PATH}/vllm_service_start.log; then
             break
         fi
-        sleep 1s
+        sleep 10s
         n=$((n+1))
     done
 }
@@ -185,7 +185,7 @@ function validate_megaservice() {
     # Curl the Mega Service
     validate_services \
         "${ip_address}:8888/v1/chatqna" \
-        "billion" \
+        "data" \
         "mega-chatqna" \
         "chatqna-xeon-backend-server" \
         '{"messages": "What is the revenue of Nike in 2023?"}'

ChatQnA/tests/_test_chatqna_vllm_ray_on_gaudi.sh renamed to ChatQnA/tests/test_chatqna_vllm_ray_on_gaudi.sh

Lines changed: 6 additions & 5 deletions

@@ -50,7 +50,7 @@ function start_services() {
     export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
     export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090"
     export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808"
-    export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8008"
+    export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8006"
     export LLM_SERVICE_PORT=9000
     export REDIS_URL="redis://${ip_address}:6379"
     export INDEX_NAME="rag-redis"
@@ -80,12 +80,13 @@ function start_services() {
     # Start Docker Containers
     docker compose -f compose_vllm_ray.yaml up -d
     n=0
-    until [[ "$n" -ge 400 ]]; do
+    until [[ "$n" -ge 25 ]]; do
+        echo "n=$n"
         docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log
-        if grep -q Connected vllm_ray_service_start.log; then
+        if grep -q "Warmup finished" vllm_ray_service_start.log; then
             break
         fi
-        sleep 1s
+        sleep 20s
         n=$((n+1))
     done
 }
@@ -166,7 +167,7 @@ function validate_microservices() {
 
     # vllm-on-ray for llm service
     validate_services \
-        "${ip_address}:8008/v1/chat/completions" \
+        "${ip_address}:8006/v1/chat/completions" \
         "content" \
         "vllm-ray-llm" \
         "vllm-ray-gaudi-server" \
