diff --git a/ChatQnA/benchmark_chatqna.yaml b/ChatQnA/benchmark_chatqna.yaml
index 1e31abf6f5..7f5ba4cabd 100644
--- a/ChatQnA/benchmark_chatqna.yaml
+++ b/ChatQnA/benchmark_chatqna.yaml
@@ -67,10 +67,10 @@ deploy:
 
 benchmark:
   # http request behavior related fields
-  user_queries: [1, 2, 4]
-  load_shape_type: "poisson"  # "constant" or "poisson"
-  concurrent_level: 5
-  poisson_arrival_rate: 1.0
+  user_queries: [640]
+  concurrency: [128]
+  load_shape_type: "constant"  # "constant" or "poisson"
+  poisson_arrival_rate: 1.0  # only used when load_shape_type is "poisson"
   warmup_iterations: 10
   seed: 1024
 
diff --git a/benchmark.py b/benchmark.py
index 60e583cb9f..3027b44224 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -26,8 +26,8 @@ def construct_benchmark_config(test_suite_config):
 
     return {
         "user_queries": test_suite_config.get("user_queries", [1]),
+        "concurrency": test_suite_config.get("concurrency", [1]),
         "load_shape_type": test_suite_config.get("load_shape_type", "constant"),
-        "concurrent_level": test_suite_config.get("concurrent_level", 5),
         "poisson_arrival_rate": test_suite_config.get("poisson_arrival_rate", 1.0),
         "warmup_iterations": test_suite_config.get("warmup_iterations", 10),
         "seed": test_suite_config.get("seed", None),
@@ -95,17 +95,11 @@ def _get_service_ip(service_name, deployment_type="k8s", service_ip=None, servic
     return svc_ip, port
 
 
-def _create_yaml_content(service, base_url, bench_target, test_phase, num_queries, test_params):
+def _create_yaml_content(service, base_url, bench_target, test_phase, num_queries, test_params, concurrency=1):
     """Create content for the run.yaml file."""
 
-    # If a load shape includes the parameter concurrent_level,
-    # the parameter will be passed to Locust to launch fixed
-    # number of simulated users.
-    concurrency = 1
-    if num_queries >= 0:
-        concurrency = max(1, num_queries // test_params["concurrent_level"])
-    else:
-        concurrency = test_params["concurrent_level"]
+    # calculate the number of concurrent users
+    concurrent_level = int(num_queries // concurrency)
 
     import importlib.util
 
@@ -126,6 +120,9 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
         print("Fail to find the opea-eval package. Please install/download it first.")
         exit(1)
 
+    load_shape = test_params["load_shape"]
+    load_shape["params"]["constant"] = {"concurrent_level": concurrent_level}
+
     yaml_content = {
         "profile": {
             "storage": {"hostpath": test_params["test_output_dir"]},
@@ -134,7 +131,7 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
                 "locustfile": os.path.join(eval_path, "evals/benchmark/stresscli/locust/aistress.py"),
                 "host": base_url,
                 "stop-timeout": test_params["query_timeout"],
-                "processes": test_params["concurrent_level"],
+                "processes": 2,  # set to 2 by default
                 "namespace": test_params["namespace"],
                 "bench-target": bench_target,
                 "service-metric-collect": test_params["collect_service_metric"],
@@ -145,7 +142,7 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
                 "seed": test_params.get("seed", None),
                 "llm-model": test_params["llm_model"],
                 "deployment-type": test_params["deployment_type"],
-                "load-shape": test_params["load_shape"],
+                "load-shape": load_shape,
             },
             "runs": [{"name": test_phase, "users": concurrency, "max-request": num_queries}],
         }
@@ -158,7 +155,7 @@ def _create_yaml_content(service, base_url, bench_target, test_phase, num_querie
     return yaml_content
 
 
-def _create_stresscli_confs(case_params, test_params, test_phase, num_queries, base_url, ts) -> str:
+def _create_stresscli_confs(case_params, test_params, test_phase, num_queries, base_url, ts, concurrency=1) -> str:
     """Create a stresscli configuration file and persist it on disk."""
     stresscli_confs = []
     # Get the workload
@@ -168,7 +165,9 @@ def _create_stresscli_confs(case_params, test_params, test_phase, num_queries, b
         print(f"[OPEA BENCHMARK] 🚀 Running test for {b_target} in phase {test_phase} for {num_queries} queries")
         stresscli_conf["envs"] = {"DATASET": test_params["dataset"][i], "MAX_LINES": str(test_params["prompt"][i])}
         # Generate the content of stresscli configuration file
-        stresscli_yaml = _create_yaml_content(case_params, base_url, b_target, test_phase, num_queries, test_params)
+        stresscli_yaml = _create_yaml_content(
+            case_params, base_url, b_target, test_phase, num_queries, test_params, concurrency
+        )
 
         # Dump the stresscli configuration file
         service_name = case_params.get("service_name")
@@ -200,9 +199,19 @@ def create_stresscli_confs(service, base_url, test_suite_config, index):
         stresscli_confs.extend(_create_stresscli_confs(service, test_suite_config, "benchmark", -1, base_url, index))
     else:
         # Test stop is controlled by request count
-        for user_queries in user_queries_lst:
+        for i, user_query in enumerate(user_queries_lst):
+            concurrency_list = test_suite_config["concurrency"]
+            user_query *= test_suite_config["node_num"]
             stresscli_confs.extend(
-                _create_stresscli_confs(service, test_suite_config, "benchmark", user_queries, base_url, index)
+                _create_stresscli_confs(
+                    service,
+                    test_suite_config,
+                    "benchmark",
+                    user_query,
+                    base_url,
+                    index,
+                    concurrency=concurrency_list[i],
+                )
             )
 
     return stresscli_confs
@@ -327,7 +336,17 @@ def _run_service_test(example, service, test_suite_config, namespace):
     return output_folders
 
 
-def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, report=False):
+def run_benchmark(benchmark_config, chart_name, namespace, node_num=1, llm_model=None, report=False):
+    """Run the benchmark test for the specified helm chart and configuration.
+
+    Args:
+        benchmark_config (dict): The benchmark configuration.
+        chart_name (str): The name of the helm chart.
+        namespace (str): The namespace to deploy the chart.
+        node_num (int): The number of nodes of current deployment.
+        llm_model (str): The LLM model to use for the test.
+        report (bool): Whether to generate a report after the test.
+    """
     # If llm_model is None or an empty string, set to default value
     if not llm_model:
         llm_model = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -344,14 +363,14 @@ def run_benchmark(benchmark_config, chart_name, namespace, llm_model=None, repor
         "service_ip": None,  # Leave as None for k8s, specify for Docker
         "service_port": None,  # Leave as None for k8s, specify for Docker
         "test_output_dir": os.getcwd() + "/benchmark_output",  # The directory to store the test output
+        "node_num": node_num,
         "load_shape": {
             "name": parsed_data["load_shape_type"],
             "params": {
-                "constant": {"concurrent_level": parsed_data["concurrent_level"]},
                 "poisson": {"arrival_rate": parsed_data["poisson_arrival_rate"]},
             },
         },
-        "concurrent_level": parsed_data["concurrent_level"],
+        "concurrency": parsed_data["concurrency"],
         "arrival_rate": parsed_data["poisson_arrival_rate"],
         "query_timeout": 120,
         "warm_ups": parsed_data["warmup_iterations"],
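For reference, the standalone sketch below (not part of the patch; the plan_runs name and its return shape are illustrative only) shows the run-planning logic the changes above introduce: each user_queries entry is paired by index with a concurrency entry, the query count is scaled by node_num, and the constant load shape's concurrent_level is derived as queries // users, mirroring create_stresscli_confs and _create_yaml_content.

    def plan_runs(user_queries, concurrency, node_num=1):
        """Pair each user_queries entry with a concurrency entry by index and derive per-run settings."""
        runs = []
        for queries, users in zip(user_queries, concurrency):
            total_queries = queries * node_num  # same scaling as user_query *= test_suite_config["node_num"]
            # value written into load_shape["params"]["constant"]["concurrent_level"] in _create_yaml_content
            concurrent_level = int(total_queries // users)
            runs.append({"users": users, "max-request": total_queries, "concurrent_level": concurrent_level})
        return runs

    # With the values from benchmark_chatqna.yaml above (user_queries: [640], concurrency: [128])
    # on a single node, this yields one run of 640 requests driven by 128 users, with concurrent_level == 5.
    print(plan_runs([640], [128], node_num=1))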