From 0e9830087a8ebf93b7cc06cfaa677b1b12225819 Mon Sep 17 00:00:00 2001
From: minmin-intel
Date: Tue, 18 Feb 2025 19:46:23 +0000
Subject: [PATCH 1/4] allow passing k and top_n to retrieval megaservice
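This makes the retriever and reranker parameters (k, top_n, search_type,
fetch_k, lambda_mult, score_threshold, distance_threshold) tunable per
request instead of fixed at schedule time. A minimal sketch of a request
using the new knobs, assuming the Xeon compose defaults for host and port
(adjust for your deployment):

    import requests

    payload = {
        "messages": "What is OPEA?",
        "k": 5,      # retriever: fetch 5 candidates (service default: 4)
        "top_n": 2,  # reranker: keep the top 2 (service default: 1)
    }
    resp = requests.post(
        "http://localhost:8889/v1/retrievaltool",
        json=payload,
        proxies={"http": ""},
    )
    print(resp.json())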
Signed-off-by: minmin-intel
---
.../intel/cpu/xeon/compose.yaml | 6 ++
DocIndexRetriever/retrieval_tool.py | 56 ++++++++++++-------
DocIndexRetriever/tests/test.py | 42 ++++++++++++++
3 files changed, 85 insertions(+), 19 deletions(-)
create mode 100644 DocIndexRetriever/tests/test.py
diff --git a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
index d4bfe0446f..9624df7300 100644
--- a/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocIndexRetriever/docker_compose/intel/cpu/xeon/compose.yaml
@@ -13,6 +13,8 @@ services:
dataprep-redis-service:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-redis-server
+ # volumes:
+ # - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/user/comps
depends_on:
- redis-vector-db
ports:
@@ -52,6 +54,8 @@ services:
embedding:
image: ${REGISTRY:-opea}/embedding:${TAG:-latest}
container_name: embedding-server
+ # volumes:
+ # - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/comps
ports:
- "6000:6000"
ipc: host
@@ -110,6 +114,8 @@ services:
reranking:
image: ${REGISTRY:-opea}/reranking:${TAG:-latest}
container_name: reranking-tei-xeon-server
+ # volumes:
+ # - $WORKDIR/GenAIExamples/DocIndexRetriever/docker_image_build/GenAIComps/comps:/home/user/comps
depends_on:
tei-reranking-service:
condition: service_healthy
diff --git a/DocIndexRetriever/retrieval_tool.py b/DocIndexRetriever/retrieval_tool.py
index b627f45537..26a7759251 100644
--- a/DocIndexRetriever/retrieval_tool.py
+++ b/DocIndexRetriever/retrieval_tool.py
@@ -22,16 +22,38 @@
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
- if self.services[cur_node].service_type == ServiceType.EMBEDDING:
- inputs["input"] = inputs["text"]
- del inputs["text"]
+ print(f"Inputs to {cur_node}: {inputs}")
+ for key, value in kwargs.items():
+ print(f"{key}: {value}")
return inputs
def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_dict, **kwargs):
next_data = {}
if self.services[cur_node].service_type == ServiceType.EMBEDDING:
- next_data = {"text": inputs["input"], "embedding": [item["embedding"] for item in data["data"]]}
+ # turn into chat completion request
+ # next_data = {"text": inputs["input"], "embedding": [item["embedding"] for item in data["data"]]}
+ print("Assembing output from Embedding for next node...")
+ print("Inputs to Embedding: ", inputs)
+ print("Keyword arguments: ")
+ for key, value in kwargs.items():
+ print(f"{key}: {value}")
+
+ next_data = {
+ "input": inputs["input"],
+ "messages": inputs["input"],
+ "embedding": data, #[item["embedding"] for item in data["data"]],
+ "k": kwargs["k"] if "k" in kwargs else 4,
+ "search_type": kwargs["search_type"] if "search_type" in kwargs else "similarity",
+ "distance_threshold": kwargs["distance_threshold"] if "distance_threshold" in kwargs else None,
+ "fetch_k": kwargs["fetch_k"] if "fetch_k" in kwargs else 20,
+ "lambda_mult": kwargs["lambda_mult"] if "lambda_mult" in kwargs else 0.5,
+ "score_threshold": kwargs["score_threshold"] if "score_threshold" in kwargs else 0.2,
+ "top_n": kwargs["top_n"] if "top_n" in kwargs else 1,
+ }
+
+ print("Output from Embedding for next node:\n", next_data)
+
else:
next_data = data
@@ -99,18 +121,6 @@ def parser_input(data, TypeClass, key):
raise ValueError(f"Unknown request type: {data}")
if isinstance(chat_request, ChatCompletionRequest):
- retriever_parameters = RetrieverParms(
- search_type=chat_request.search_type if chat_request.search_type else "similarity",
- k=chat_request.k if chat_request.k else 4,
- distance_threshold=chat_request.distance_threshold if chat_request.distance_threshold else None,
- fetch_k=chat_request.fetch_k if chat_request.fetch_k else 20,
- lambda_mult=chat_request.lambda_mult if chat_request.lambda_mult else 0.5,
- score_threshold=chat_request.score_threshold if chat_request.score_threshold else 0.2,
- )
- reranker_parameters = RerankerParms(
- top_n=chat_request.top_n if chat_request.top_n else 1,
- )
-
initial_inputs = {
"messages": query,
"input": query, # has to be input due to embedding expects either input or text
@@ -123,13 +133,21 @@ def parser_input(data, TypeClass, key):
"top_n": chat_request.top_n if chat_request.top_n else 1,
}
+ kwargs = {
+ "search_type": chat_request.search_type if chat_request.search_type else "similarity",
+ "k": chat_request.k if chat_request.k else 4,
+ "distance_threshold": chat_request.distance_threshold if chat_request.distance_threshold else None,
+ "fetch_k": chat_request.fetch_k if chat_request.fetch_k else 20,
+ "lambda_mult": chat_request.lambda_mult if chat_request.lambda_mult else 0.5,
+ "score_threshold": chat_request.score_threshold if chat_request.score_threshold else 0.2,
+ "top_n": chat_request.top_n if chat_request.top_n else 1,
+ }
result_dict, runtime_graph = await self.megaservice.schedule(
initial_inputs=initial_inputs,
- retriever_parameters=retriever_parameters,
- reranker_parameters=reranker_parameters,
+ **kwargs,
)
else:
- result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs={"text": query})
+ result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs={"input": query})
last_node = runtime_graph.all_leaves()[-1]
response = result_dict[last_node]
diff --git a/DocIndexRetriever/tests/test.py b/DocIndexRetriever/tests/test.py
new file mode 100644
index 0000000000..e655073ddb
--- /dev/null
+++ b/DocIndexRetriever/tests/test.py
@@ -0,0 +1,42 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import requests
+
+
+def search_knowledge_base(query: str) -> str:
+ """Search the knowledge base for a specific query."""
+ url = os.environ.get("RETRIEVAL_TOOL_URL")
+ print(url)
+ proxies = {"http": ""}
+ payload = {
+ "messages": query,
+ "k":5,
+ "top_n": 2
+ }
+ response = requests.post(url, json=payload, proxies=proxies)
+ print(response)
+ if "documents" in response.json():
+ docs = response.json()["documents"]
+ context = ""
+ for i, doc in enumerate(docs):
+ context += f"Doc[{i+1}]:\n{doc}\n"
+ return context
+ elif "text" in response.json():
+ return response.json()["text"]
+ elif "reranked_docs" in response.json():
+ docs = response.json()["reranked_docs"]
+ context = ""
+ for i, doc in enumerate(docs):
+ context += f"Doc[{i+1}]:\n{doc}\n"
+ return context
+ else:
+ return "Error parsing response from the knowledge base."
+
+
+if __name__ == "__main__":
+ resp = search_knowledge_base("What is OPEA?")
+ # resp = search_knowledge_base("Thriller")
+ print(resp)
\ No newline at end of file
From 2e0ee2cf5f570cb0b6358c7582557997f4c43c98 Mon Sep 17 00:00:00 2001
From: minmin-intel
Date: Tue, 18 Feb 2025 22:28:15 +0000
Subject: [PATCH 2/4] update agent example
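Switch the SQL agent to the Chinook sample database, enable memory on the
supervisor (strategy react_llama, stream on), and replace the curl-based
validation with tests/test.py. A minimal sketch of the two-turn supervisor
check the new test performs, assuming the default endpoint from the compose
files (thread_id ties both turns to one conversation):

    import uuid

    import requests

    url = "http://127.0.0.1:9090/v1/chat/completions"
    thread_id = str(uuid.uuid4())  # one id per conversation
    for msg in [
        "Which artist has the most albums in the database?",
        "Give me a few examples of the artist's albums?",
    ]:
        resp = requests.post(
            url,
            json={"role": "user", "messages": msg, "thread_id": thread_id, "stream": False},
        )
        print(resp.json()["text"])  # non-streaming responses carry a "text" field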
Signed-off-by: minmin-intel
---
AgentQnA/README.md | 44 ++++-------
.../intel/cpu/xeon/compose_openai.yaml | 11 ++-
.../cpu/xeon/launch_agent_service_openai.sh | 4 +-
.../intel/hpu/gaudi/compose.yaml | 5 +-
.../hpu/gaudi/launch_agent_service_gaudi.sh | 2 +-
... step4_launch_and_validate_agent_gaudi.sh} | 56 ++++---------
AgentQnA/tests/test.py | 79 ++++++++++++-------
AgentQnA/tests/test_compose_on_gaudi.sh | 2 +-
8 files changed, 96 insertions(+), 107 deletions(-)
rename AgentQnA/tests/{step4_launch_and_validate_agent_tgi.sh => step4_launch_and_validate_agent_gaudi.sh} (84%)
diff --git a/AgentQnA/README.md b/AgentQnA/README.md
index d45b14ef55..8e77f2f1a6 100644
--- a/AgentQnA/README.md
+++ b/AgentQnA/README.md
@@ -84,7 +84,7 @@ flowchart LR
3. Hierarchical multi-agents can improve performance.
Expert worker agents, such as the RAG agent and SQL agent, can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information to provide a comprehensive answer. If we only use one agent and provide all the tools to this single agent, it may get overwhelmed and not be able to provide accurate answers.
-## Deployment with docker
+## Deploy with docker
1. Build agent docker image [Optional]
@@ -217,13 +217,19 @@ docker build -t opea/agent:latest --build-arg https_proxy=$https_proxy --build-a
:::
::::
+## Deploy AgentQnA UI
+
+The AgentQnA UI can be deployed locally or using Docker.
+
+For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
+
## Deploy using Helm Chart
Refer to the [AgentQnA helm chart](./kubernetes/helm/README.md) for instructions on deploying AgentQnA on Kubernetes.
## Validate services
-First look at logs of the agent docker containers:
+1. First, look at the logs of the agent docker containers:
```
# worker RAG agent
@@ -240,35 +246,17 @@ docker logs react-agent-endpoint
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
-Second, validate worker RAG agent:
-
-```
-curl http://${host_ip}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
- "messages": "Michael Jackson song Thriller"
- }'
-```
-
-Third, validate worker SQL agent:
+2. You can use Python to validate the agent system:
+```bash
+# RAG worker agent
+python tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
-```
-curl http://${host_ip}:9096/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
- "messages": "How many employees are in the company"
- }'
-```
-
-Finally, validate supervisor agent:
+# SQL agent
+python tests/test.py --prompt "How many employees in company" --agent_role "worker" --ext_port 9096
+# supervisor agent: this will test a two-turn conversation
+python tests/test.py --agent_role "supervisor" --ext_port 9090
```
-curl http://${host_ip}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{
- "messages": "How many albums does Iron Maiden have?"
- }'
-```
-
-## Deploy AgentQnA UI
-
-The AgentQnA UI can be deployed locally or using Docker.
-
-For detailed instructions on deploying AgentQnA UI, refer to the [AgentQnA UI Guide](./ui/svelte/README.md).
## How to register your own tools with agent
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
index 09bde26bde..bbd64ceb30 100644
--- a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml
@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent
+ with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -35,17 +36,17 @@ services:
image: opea/agent:latest
container_name: sql-agent-endpoint
volumes:
- - ${WORKDIR}/TAG-Bench/:/home/user/TAG-Bench # SQL database
+ - ${WORKDIR}/GenAIExamples/AgentQnA/tests:/home/user/chinook-db # SQL database
ports:
- "9096:9096"
ipc: host
environment:
ip_address: ${ip_address}
strategy: sql_agent
+ with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
- hints_file: /home/user/TAG-Bench/${db_name}_hints.csv
recursion_limit: ${recursion_limit_worker}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
@@ -64,6 +65,7 @@ services:
container_name: react-agent-endpoint
depends_on:
- worker-rag-agent
+ - worker-sql-agent
volumes:
- ${TOOLSET_PATH}:/home/user/tools/
ports:
@@ -71,14 +73,15 @@ services:
ipc: host
environment:
ip_address: ${ip_address}
- strategy: react_langgraph
+ strategy: react_llama
+ with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: openai
OPENAI_API_KEY: ${OPENAI_API_KEY}
model: ${model}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
- stream: false
+ stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh
index 7b4e86a781..2455865f27 100644
--- a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh
+++ b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh
@@ -16,7 +16,7 @@ export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions"
export SQL_AGENT_URL="http://${ip_address}:9096/v1/chat/completions"
export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool"
export CRAG_SERVER=http://${ip_address}:8080
-export db_name=california_schools
-export db_path="sqlite:////home/user/TAG-Bench/dev_folder/dev_databases/${db_name}/${db_name}.sqlite"
+export db_name=Chinook
+export db_path="sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
docker compose -f compose_openai.yaml up -d
diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 4895722c93..c14d58c10b 100644
--- a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -13,6 +13,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: rag_agent_llama
+ with_memory: false
recursion_limit: ${recursion_limit_worker}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -43,6 +44,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: sql_agent_llama
+ with_memory: false
db_name: ${db_name}
db_path: ${db_path}
use_hints: false
@@ -74,6 +76,7 @@ services:
environment:
ip_address: ${ip_address}
strategy: react_llama
+ with_memory: true
recursion_limit: ${recursion_limit_supervisor}
llm_engine: vllm
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
@@ -81,7 +84,7 @@ services:
model: ${LLM_MODEL_ID}
temperature: ${temperature}
max_new_tokens: ${max_new_tokens}
- stream: false
+ stream: true
tools: /home/user/tools/supervisor_agent_tools.yaml
require_human_feedback: false
no_proxy: ${no_proxy}
diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh
index fff5d53f8d..298feee3fd 100644
--- a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh
+++ b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_gaudi.sh
@@ -14,7 +14,7 @@ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export HF_CACHE_DIR=${HF_CACHE_DIR}
ls $HF_CACHE_DIR
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct"
+export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
export NUM_SHARDS=4
export LLM_ENDPOINT_URL="http://${ip_address}:8086"
export temperature=0
diff --git a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
similarity index 84%
rename from AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
rename to AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
index 824f7aa855..7c3800ba78 100644
--- a/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh
+++ b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
@@ -11,15 +11,15 @@ export ip_address=$(hostname -I | awk '{print $1}')
export TOOLSET_PATH=$WORKPATH/tools/
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
-model="meta-llama/Meta-Llama-3.1-70B-Instruct"
+model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
-export HF_CACHE_DIR=/data2/huggingface
-if [ ! -d "$HF_CACHE_DIR" ]; then
- HF_CACHE_DIR=$WORKDIR/hf_cache
- mkdir -p "$HF_CACHE_DIR"
-fi
-echo "HF_CACHE_DIR=$HF_CACHE_DIR"
-ls $HF_CACHE_DIR
+# export HF_CACHE_DIR=/data2/huggingface
+# if [ ! -d "$HF_CACHE_DIR" ]; then
+# HF_CACHE_DIR=$WORKDIR/hf_cache
+# mkdir -p "$HF_CACHE_DIR"
+# fi
+# echo "HF_CACHE_DIR=$HF_CACHE_DIR"
+ls $HF_CACHE_DIR/hub
vllm_port=8086
vllm_volume=${HF_CACHE_DIR}
@@ -60,23 +60,6 @@ function start_vllm_service_70B() {
echo "Service started successfully"
}
-
-function prepare_data() {
- cd $WORKDIR
-
- echo "Downloading data..."
- git clone https://github.com/TAG-Research/TAG-Bench.git
- cd TAG-Bench/setup
- chmod +x get_dbs.sh
- ./get_dbs.sh
-
- echo "Split data..."
- cd $WORKPATH/tests/sql_agent_test
- bash run_data_split.sh
-
- echo "Data preparation done!"
-}
-
function download_chinook_data(){
echo "Downloading chinook data..."
cd $WORKDIR
@@ -113,7 +96,7 @@ function validate_agent_service() {
echo "======================Testing worker rag agent======================"
export agent_port="9095"
prompt="Tell me about Michael Jackson song Thriller"
- local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
+ local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
# echo $CONTENT
local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint")
echo $EXIT_CODE
@@ -127,7 +110,7 @@ function validate_agent_service() {
echo "======================Testing worker sql agent======================"
export agent_port="9096"
prompt="How many employees are there in the company?"
- local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
+ local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt" --agent_role "worker" --ext_port $agent_port)
local EXIT_CODE=$(validate "$CONTENT" "8" "sql-agent-endpoint")
echo $CONTENT
# echo $EXIT_CODE
@@ -140,9 +123,8 @@ function validate_agent_service() {
# test supervisor react agent
echo "======================Testing supervisor react agent======================"
export agent_port="9090"
- prompt="How many albums does Iron Maiden have?"
- local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --prompt "$prompt")
- local EXIT_CODE=$(validate "$CONTENT" "21" "react-agent-endpoint")
+ local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py --agent_role "supervisor" --ext_port $agent_port --stream)
+ local EXIT_CODE=$(validate "$CONTENT" "Iron" "react-agent-endpoint")
# echo $CONTENT
echo $EXIT_CODE
local EXIT_CODE="${EXIT_CODE:0-1}"
@@ -153,15 +135,6 @@ function validate_agent_service() {
}
-function remove_data() {
- echo "Removing data..."
- cd $WORKDIR
- if [ -d "TAG-Bench" ]; then
- rm -rf TAG-Bench
- fi
- echo "Data removed!"
-}
-
function remove_chinook_data(){
echo "Removing chinook data..."
cd $WORKDIR
@@ -189,8 +162,9 @@ function main() {
echo "==================== Agent service validated ===================="
}
-remove_data
+
remove_chinook_data
+
main
-remove_data
+
remove_chinook_data
diff --git a/AgentQnA/tests/test.py b/AgentQnA/tests/test.py
index 400684ffd6..046fcd9209 100644
--- a/AgentQnA/tests/test.py
+++ b/AgentQnA/tests/test.py
@@ -1,34 +1,20 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import argparse
-import os
+import json
+import uuid
import requests
-def generate_answer_agent_api(url, prompt):
- proxies = {"http": ""}
- payload = {
- "messages": prompt,
- }
- response = requests.post(url, json=payload, proxies=proxies)
- answer = response.json()["text"]
- return answer
-
-
def process_request(url, query, is_stream=False):
proxies = {"http": ""}
-
- payload = {
- "messages": query,
- }
-
+ content = json.dumps(query) if query is not None else None
try:
- resp = requests.post(url=url, json=payload, proxies=proxies, stream=is_stream)
+ resp = requests.post(url=url, data=content, proxies=proxies, stream=is_stream)
if not is_stream:
ret = resp.json()["text"]
- print(ret)
else:
for line in resp.iter_lines(decode_unicode=True):
print(line)
@@ -38,19 +24,54 @@ def process_request(url, query, is_stream=False):
return ret
except requests.exceptions.RequestException as e:
ret = f"An error occurred:{e}"
- print(ret)
- return False
+ return None
+
+def test_worker_agent(args):
+ url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
+ query = {"role": "user", "messages": args.prompt, "stream": "false"}
+ ret = process_request(url, query)
+ print("Response: ", ret)
+
+
+def add_message_and_run(url, user_message, thread_id, stream=False):
+ print("User message: ", user_message)
+ query = {"role": "user", "messages": user_message, "thread_id": thread_id, "stream": stream}
+ ret = process_request(url, query, is_stream=stream)
+ print("Response: ", ret)
+
+
+def test_chat_completion_multi_turn(args):
+ url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
+ thread_id = f"{uuid.uuid4()}"
+
+ # first turn
+ print("===============First turn==================")
+ user_message = "Which artist has the most albums in the database?"
+ add_message_and_run(url, user_message, thread_id, stream=args.stream)
+ print("===============End of first turn==================")
+
+ # second turn
+ print("===============Second turn==================")
+ user_message = "Give me a few examples of the artist's albumns?"
+ add_message_and_run(url, user_message, thread_id, stream=args.stream)
+ print("===============End of second turn==================")
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument("--prompt", type=str)
- parser.add_argument("--stream", action="store_true")
- args = parser.parse_args()
+ parser.add_argument("--ip_addr", type=str, default="127.0.0.1", help="endpoint ip address")
+ parser.add_argument("--ext_port", type=str, default="9090", help="endpoint port")
+ parser.add_argument("--stream", action="store_true", help="streaming mode")
+ parser.add_argument("--prompt", type=str, help="prompt message")
+ parser.add_argument("--agent_role", type=str, default="supervisor", help="supervisor or worker")
+ args, _ = parser.parse_known_args()
- ip_address = os.getenv("ip_address", "localhost")
- agent_port = os.getenv("agent_port", "9090")
- url = f"http://{ip_address}:{agent_port}/v1/chat/completions"
- prompt = args.prompt
+ print(args)
- process_request(url, prompt, args.stream)
+ if args.agent_role == "supervisor":
+ test_chat_completion_multi_turn(args)
+ elif args.agent_role == "worker":
+ test_worker_agent(args)
+ else:
+ raise ValueError("Invalid agent role")
\ No newline at end of file
diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh
index de70514ba6..ab0ce295cb 100644
--- a/AgentQnA/tests/test_compose_on_gaudi.sh
+++ b/AgentQnA/tests/test_compose_on_gaudi.sh
@@ -78,7 +78,7 @@ bash step3_ingest_data_and_validate_retrieval.sh
echo "=================== #3 Data ingestion and validation completed===================="
echo "=================== #4 Start agent and API server===================="
-bash step4_launch_and_validate_agent_tgi.sh
+bash step4_launch_and_validate_agent_gaudi.sh
echo "=================== #4 Agent test passed ===================="
echo "=================== #5 Stop agent and API server===================="
From d219e028daa4363f7ade4078e49e7e90bb35b9df Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 18 Feb 2025 22:29:57 +0000
Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---
AgentQnA/README.md | 1 +
AgentQnA/tests/test.py | 6 +++---
DocIndexRetriever/retrieval_tool.py | 2 +-
DocIndexRetriever/tests/test.py | 8 ++------
4 files changed, 7 insertions(+), 10 deletions(-)
diff --git a/AgentQnA/README.md b/AgentQnA/README.md
index 8e77f2f1a6..397bd0c775 100644
--- a/AgentQnA/README.md
+++ b/AgentQnA/README.md
@@ -247,6 +247,7 @@ docker logs react-agent-endpoint
You should see something like "HTTP server setup successful" if the docker containers are started successfully.
2. You can use Python to validate the agent system:
+
```bash
# RAG worker agent
python tests/test.py --prompt "Tell me about Michael Jackson song Thriller" --agent_role "worker" --ext_port 9095
diff --git a/AgentQnA/tests/test.py b/AgentQnA/tests/test.py
index 046fcd9209..18254f16c5 100644
--- a/AgentQnA/tests/test.py
+++ b/AgentQnA/tests/test.py
@@ -26,6 +26,7 @@ def process_request(url, query, is_stream=False):
ret = f"An error occurred:{e}"
return None
+
def test_worker_agent(args):
url = f"http://{args.ip_addr}:{args.ext_port}/v1/chat/completions"
query = {"role": "user", "messages": args.prompt, "stream": "false"}
@@ -52,12 +53,11 @@ def test_chat_completion_multi_turn(args):
# second turn
print("===============Second turn==================")
- user_message = "Give me a few examples of the artist's albumns?"
+ user_message = "Give me a few examples of the artist's albums?"
add_message_and_run(url, user_message, thread_id, stream=args.stream)
print("===============End of second turn==================")
-
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ip_addr", type=str, default="127.0.0.1", help="endpoint ip address")
@@ -74,4 +74,4 @@ def test_chat_completion_multi_turn(args):
elif args.agent_role == "worker":
test_worker_agent(args)
else:
- raise ValueError("Invalid agent role")
\ No newline at end of file
+ raise ValueError("Invalid agent role")
diff --git a/DocIndexRetriever/retrieval_tool.py b/DocIndexRetriever/retrieval_tool.py
index 26a7759251..99fab7b1b5 100644
--- a/DocIndexRetriever/retrieval_tool.py
+++ b/DocIndexRetriever/retrieval_tool.py
@@ -42,7 +42,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
next_data = {
"input": inputs["input"],
"messages": inputs["input"],
- "embedding": data, #[item["embedding"] for item in data["data"]],
+ "embedding": data, # [item["embedding"] for item in data["data"]],
"k": kwargs["k"] if "k" in kwargs else 4,
"search_type": kwargs["search_type"] if "search_type" in kwargs else "similarity",
"distance_threshold": kwargs["distance_threshold"] if "distance_threshold" in kwargs else None,
diff --git a/DocIndexRetriever/tests/test.py b/DocIndexRetriever/tests/test.py
index e655073ddb..ba74827fa6 100644
--- a/DocIndexRetriever/tests/test.py
+++ b/DocIndexRetriever/tests/test.py
@@ -11,11 +11,7 @@ def search_knowledge_base(query: str) -> str:
url = os.environ.get("RETRIEVAL_TOOL_URL")
print(url)
proxies = {"http": ""}
- payload = {
- "messages": query,
- "k":5,
- "top_n": 2
- }
+ payload = {"messages": query, "k": 5, "top_n": 2}
response = requests.post(url, json=payload, proxies=proxies)
print(response)
if "documents" in response.json():
@@ -39,4 +35,4 @@ def search_knowledge_base(query: str) -> str:
if __name__ == "__main__":
resp = search_knowledge_base("What is OPEA?")
# resp = search_knowledge_base("Thriller")
- print(resp)
\ No newline at end of file
+ print(resp)
From 7d10a4d8a8dbb94c9aff3cf3890dd9aee4e0a8a3 Mon Sep 17 00:00:00 2001
From: minmin-intel
Date: Tue, 18 Feb 2025 23:05:14 +0000
Subject: [PATCH 4/4] fix hf cache dir
Signed-off-by: minmin-intel
---
.../tests/step4_launch_and_validate_agent_gaudi.sh | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
index 7c3800ba78..56f017239b 100644
--- a/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
+++ b/AgentQnA/tests/step4_launch_and_validate_agent_gaudi.sh
@@ -13,13 +13,13 @@ export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
model="meta-llama/Llama-3.3-70B-Instruct" #"meta-llama/Meta-Llama-3.1-70B-Instruct"
-# export HF_CACHE_DIR=/data2/huggingface
-# if [ ! -d "$HF_CACHE_DIR" ]; then
-# HF_CACHE_DIR=$WORKDIR/hf_cache
-# mkdir -p "$HF_CACHE_DIR"
-# fi
-# echo "HF_CACHE_DIR=$HF_CACHE_DIR"
-ls $HF_CACHE_DIR/hub
+export HF_CACHE_DIR=/data2/huggingface
+if [ ! -d "$HF_CACHE_DIR" ]; then
+ HF_CACHE_DIR=$WORKDIR/hf_cache
+ mkdir -p "$HF_CACHE_DIR"
+fi
+echo "HF_CACHE_DIR=$HF_CACHE_DIR"
+ls $HF_CACHE_DIR
vllm_port=8086
vllm_volume=${HF_CACHE_DIR}