32 changes: 32 additions & 0 deletions ChatQnA/Dockerfile.wrapper
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev \
git

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

WORKDIR /home/user/
RUN git clone https://github.com/opea-project/GenAIComps.git

WORKDIR /home/user/GenAIComps
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt

COPY ./chatqna_wrapper.py /home/user/chatqna.py

ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps

USER user

WORKDIR /home/user

RUN echo 'ulimit -S -n 999999' >> ~/.bashrc

ENTRYPOINT ["python", "chatqna.py"]
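The compose file below (docker_image_build/build.yaml) builds this image with context ../ and the tag ${REGISTRY:-opea}/chatqna-wrapper:${TAG:-latest}. For local experiments outside compose, a minimal sketch using the Docker SDK for Python — this assumes `pip install docker`, a running Docker daemon, and that the repository root is the working directory:

# Sketch only: build the wrapper image without compose.
# Assumes the Docker SDK for Python and that the current directory is
# the repository root, so the build context "ChatQnA" matches
# `context: ../` in build.yaml.
import docker

client = docker.from_env()
image, logs = client.images.build(
    path="ChatQnA",                    # build context
    dockerfile="Dockerfile.wrapper",   # relative to the context
    tag="opea/chatqna-wrapper:latest", # mirrors the compose default
)
for chunk in logs:
    print(chunk.get("stream", ""), end="")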
68 changes: 68 additions & 0 deletions ChatQnA/chatqna_wrapper.py
@@ -0,0 +1,68 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType

MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0")
MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888))
EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0")
EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000))
RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0")
RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000))
RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0")
RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000))
LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0")
LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000))


class ChatQnAService:
def __init__(self, host="0.0.0.0", port=8000):
self.host = host
self.port = port
self.megaservice = ServiceOrchestrator()

def add_remote_service(self):
embedding = MicroService(
name="embedding",
host=EMBEDDING_SERVICE_HOST_IP,
port=EMBEDDING_SERVICE_PORT,
endpoint="/v1/embeddings",
use_remote_service=True,
service_type=ServiceType.EMBEDDING,
)
retriever = MicroService(
name="retriever",
host=RETRIEVER_SERVICE_HOST_IP,
port=RETRIEVER_SERVICE_PORT,
endpoint="/v1/retrieval",
use_remote_service=True,
service_type=ServiceType.RETRIEVER,
)
rerank = MicroService(
name="rerank",
host=RERANK_SERVICE_HOST_IP,
port=RERANK_SERVICE_PORT,
endpoint="/v1/reranking",
use_remote_service=True,
service_type=ServiceType.RERANK,
)
llm = MicroService(
name="llm",
host=LLM_SERVICE_HOST_IP,
port=LLM_SERVICE_PORT,
endpoint="/v1/chat/completions",
use_remote_service=True,
service_type=ServiceType.LLM,
)
self.megaservice.add(embedding).add(retriever).add(rerank).add(llm)
self.megaservice.flow_to(embedding, retriever)
self.megaservice.flow_to(retriever, rerank)
self.megaservice.flow_to(rerank, llm)
self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port)


if __name__ == "__main__":
chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT)
chatqna.add_remote_service()
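Once the wrapper is up (it binds MEGA_SERVICE_HOST_IP:MEGA_SERVICE_PORT, 0.0.0.0:8888 by default), a minimal smoke test can POST a question to the gateway. This is a sketch: the /v1/chatqna route and the {"messages": ...} payload follow the ChatQnAGateway conventions in GenAIComps and should be verified against the checkout the image clones.

# Minimal smoke test for the megaservice gateway (sketch).
# Assumes the gateway is reachable on localhost:8888 and that
# ChatQnAGateway registers the /v1/chatqna route, as in GenAIComps.
import json
import urllib.request

payload = {"messages": "What is OPEA?"}
req = urllib.request.Request(
    "http://localhost:8888/v1/chatqna",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    # The gateway may stream its reply; print the raw body as-is.
    print(resp.read().decode("utf-8"))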
6 changes: 6 additions & 0 deletions ChatQnA/docker_image_build/build.yaml
@@ -11,6 +11,12 @@ services:
context: ../
dockerfile: ./Dockerfile
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
chatqna-wrapper:
build:
context: ../
dockerfile: ./Dockerfile.wrapper
extends: chatqna
image: ${REGISTRY:-opea}/chatqna-wrapper:${TAG:-latest}
chatqna-guardrails:
build:
context: ../
@@ -1184,13 +1184,8 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
<<<<<<< HEAD
image: "ghcr.io/huggingface/tgi-gaudi:2.0.6"
imagePullPolicy: IfNotPresent
=======
image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
imagePullPolicy: Always
>>>>>>> e3187be819ad088c24bf1b2cbb419255af0f2be3
volumeMounts:
- mountPath: /data
name: model-volume
113 changes: 65 additions & 48 deletions ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml
@@ -43,6 +43,7 @@ metadata:
app.kubernetes.io/managed-by: Helm
data:
TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei"
HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
http_proxy: ""
https_proxy: ""
no_proxy: ""
@@ -70,9 +71,8 @@
no_proxy: ""
LOGFLAG: ""
vLLM_ENDPOINT: "http://chatqna-vllm"
HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
LLM_MODEL: "meta-llama/Llama-3.1-70B-Instruct"
MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct"
LLM_MODEL: "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
---
# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml
# Copyright (C) 2024 Intel Corporation
@@ -145,7 +145,6 @@ data:
NUMBA_CACHE_DIR: "/tmp"
TRANSFORMERS_CACHE: "/tmp/transformers_cache"
HF_HOME: "/tmp/.cache/huggingface"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
---
# Source: chatqna/charts/teirerank/templates/configmap.yaml
# Copyright (C) 2024 Intel Corporation
@@ -170,6 +169,7 @@ data:
NUMBA_CACHE_DIR: "/tmp"
TRANSFORMERS_CACHE: "/tmp/transformers_cache"
HF_HOME: "/tmp/.cache/huggingface"
MAX_WARMUP_SEQUENCE_LENGTH: "512"
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
@@ -183,7 +183,7 @@ metadata:
app.kubernetes.io/instance: chatqna
app.kubernetes.io/version: "2.1.0"
data:
MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct"
MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
PORT: "2080"
HF_TOKEN: "insert-your-huggingface-token-here"
http_proxy: ""
@@ -194,6 +194,12 @@ data:
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
HF_HOME: "/tmp/.cache/huggingface"
GPU_MEMORY_UTILIZATION: "0.5"
DTYPE: "auto"
TENSOR_PARALLEL_SIZE: "1"
BLOCK_SIZE: "128"
MAX_NUM_SEQS: "256"
MAX_SEQ_LEN_TO_CAPTURE: "2048"
---
# Source: chatqna/templates/nginx-deployment.yaml
apiVersion: v1
@@ -649,7 +655,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/dataprep-redis:v0.9"
image: "opea/dataprep-redis:latest"
imagePullPolicy: Always
ports:
- name: data-prep
@@ -1103,10 +1109,8 @@ spec:
- configMapRef:
name: chatqna-tei-config
securityContext:
privileged: true
capabilities:
add: ["SYS_NICE"]
image: "ghcr.io/huggingface/tei-gaudi:1.5.0"
{}
image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
imagePullPolicy: IfNotPresent
args:
- "--auto-truncate"
Expand Down Expand Up @@ -1140,16 +1144,8 @@ spec:
initialDelaySeconds: 5
periodSeconds: 5
resources:
limits:
habana.ai/gaudi: 1
cpu: 10
memory: 100Gi
hugepages-2Mi: 9800Mi
requests:
habana.ai/gaudi: 1
cpu: 10
memory: 100Gi
hugepages-2Mi: 9800Mi
{}

volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
@@ -1191,11 +1187,17 @@ spec:
- configMapRef:
name: chatqna-teirerank-config
securityContext:
{}
image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
imagePullPolicy: Always
args:
- "--auto-truncate"
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: false
runAsNonRoot: true
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "ghcr.io/huggingface/tei-gaudi:1.5.0"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
name: model-volume
@@ -1228,7 +1230,8 @@ spec:
initialDelaySeconds: 5
periodSeconds: 5
resources:
{}
limits:
habana.ai/gaudi: 1
volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
@@ -1242,6 +1245,7 @@ spec:
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


apiVersion: apps/v1
kind: Deployment
metadata:
@@ -1271,16 +1275,36 @@ spec:
- configMapRef:
name: chatqna-vllm-config
securityContext:
privileged: true
allowPrivilegeEscalation: false
capabilities:
add: ["SYS_NICE"]
image: "opea/llm-vllm-hpu:latest"
command:
- /bin/bash
- -c
- |
export VLLM_CPU_KVCACHE_SPACE=40 && \
python3 -m vllm.entrypoints.openai.api_server --enforce-eager --gpu-memory-utilization 0.5 --dtype auto --model $MODEL_ID --port 2080 --tensor-parallel-size 8 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/vllm-hpu:latest"
args:
- "--enforce-eager"
- "--model"
- "$(MODEL_ID)"
- "--tensor-parallel-size"
- "1"
- "--gpu-memory-utilization"
- "$(GPU_MEMORY_UTILIZATION)"
- "--dtype"
- "$(DTYPE)"
- "--max-num-seqs"
- "$(MAX_NUM_SEQS)"
- "--block-size"
- "$(BLOCK_SIZE)"
- "--max-seq-len-to-capture"
- "$(MAX_SEQ_LEN_TO_CAPTURE)"
- "--host"
- "0.0.0.0"
- "--port"
- "$(PORT)"
imagePullPolicy: Always
volumeMounts:
- mountPath: /data
@@ -1293,20 +1317,13 @@ spec:
protocol: TCP
resources:
limits:
habana.ai/gaudi: 8
cpu: 40
memory: 400Gi
hugepages-2Mi: 9800Mi
requests:
habana.ai/gaudi: 8
cpu: 40
memory: 400Gi
hugepages-2Mi: 9800Mi
habana.ai/gaudi: 1
volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
- name: tmp
emptyDir: {}
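The $(MODEL_ID), $(PORT), and related args above are expanded by Kubernetes from the container environment, which the chatqna-vllm-config ConfigMap populates. Since vLLM's api_server speaks the OpenAI chat-completions protocol, the deployment can also be smoke-tested directly, bypassing the LLM microservice. A sketch, assuming port 2080 has been port-forwarded to localhost (the in-cluster Service named in vLLM_ENDPOINT may expose a different port):

# Sketch: query the vLLM OpenAI-compatible endpoint directly,
# e.g. after port-forwarding the chatqna-vllm pod's port 2080.
# The model name must match MODEL_ID from the ConfigMap above.
import json
import urllib.request

payload = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32,
}
req = urllib.request.Request(
    "http://localhost:2080/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.loads(resp.read().decode("utf-8"))
    print(body["choices"][0]["message"]["content"])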

---
# Source: chatqna/templates/deployment.yaml
# Copyright (C) 2024 Intel Corporation
@@ -1350,8 +1367,8 @@ spec:
value: chatqna-retriever-usvc
- name: EMBEDDING_SERVICE_HOST_IP
value: chatqna-embedding-usvc
- name: GUARDRAIL_SERVICE_HOST_IP
value: chatqna-guardrails-usvc
- name: MODEL_ID
value: "meta-llama/Meta-Llama-3-8B-Instruct"
securityContext:
allowPrivilegeEscalation: false
capabilities:
@@ -1362,8 +1379,8 @@
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/chatqna:latest"
imagePullPolicy: Always
image: "opea/chatqna-wrapper:latest"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /tmp
name: tmp