Skip to content

Commit

Permalink
Support e2e and first token P90 statistics (#77)
Browse files Browse the repository at this point in the history
* Support e2e and first token P90 statistics

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix high-concurrency test issue

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
  • Loading branch information
lvliang-intel authored Aug 26, 2024
1 parent d754a84 commit b07cd12
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 12 deletions.
10 changes: 9 additions & 1 deletion evals/benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,15 @@ pip install -r ../../requirements.txt

1 Define the test cases and configurations in the benchmark.yaml file.

2 Run the benchmark script:
2 Temporarily increase the file descriptor limit before running the test:

```bash
ulimit -n 100000
```

This command increases the maximum number of file descriptors (which represent open files, network connections, etc.) that a single process can use. By default, many systems set a conservative limit, such as 1024, which may not be sufficient for high-concurrency applications or large-scale load testing. Raising this limit ensures that the process can handle a larger number of open connections or files without running into errors caused by insufficient file descriptors.

3 Run the benchmark script:

```bash
python benchmark.py
Expand Down
10 changes: 6 additions & 4 deletions evals/benchmark/stresscli/commands/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ RPS = RPS:\s+([\d.]+)
Input_Tokens_per_Second = Input Tokens per Second:\s+([\d.]+)
Output_Tokens_per_Second = Output Tokens per Second:\s+([\d.]+)
End_to_End_latency_P50 = End to End latency\(ms\),\s+P50:\s+([\d.]+)
End_to_End_latency_P99 = End to End latency\(ms\),\s+P50:[\s\d.,]+P99:\s+([\d.]+)
End_to_End_latency_Avg = End to End latency\(ms\),\s+P50:[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
End_to_End_latency_P90 = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
End_to_End_latency_P99 = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
End_to_End_latency_Avg = End to End latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
First_token_latency_P50 = First token latency\(ms\),\s+P50:\s+([\d.]+)
First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P99:\s+([\d.]+)
First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
First_token_latency_P90 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+([\d.]+)
First_token_latency_P99 = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+([\d.]+)
First_token_latency_Avg = First token latency\(ms\),\s+P50:[\s\d.,]+P90:\s+[\s\d.,]+P99:\s+[\s\d.,]+Avg:\s+([\d.]+)
Average_Next_token_latency = Average Next token latency\(ms\):\s+([\d.]+)
Average_token_latency = Average token latency\(ms\)\s+:\s+([\d.]+)
locust_num_requests = \"num_requests\":\s+(\d+)
Expand Down
7 changes: 5 additions & 2 deletions evals/benchmark/stresscli/commands/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
os.makedirs(end_output_folder, exist_ok=True)
metrics_output = os.path.join(output_folder, f"{index}_metrics.json")

spawn_rate = 100 if runspec["users"] > 100 else runspec["users"]
processes = 10 if runspec["max_requests"] > 2000 else 5 if runspec["max_requests"] > 1000 else 2

cmd = [
"locust",
"--locustfile",
Expand All @@ -126,11 +129,11 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
"--users",
str(runspec["users"]),
"--spawn-rate",
str(runspec["users"]),
str(spawn_rate),
"--max-request",
str(runspec["max_requests"]),
"--processes",
str(runspec["processes"]),
str(processes),
"--bench-target",
str(runspec["bench-target"]),
"--llm-model",
Expand Down
2 changes: 1 addition & 1 deletion evals/benchmark/stresscli/locust/aistress.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _(parser):
help="Stop the benchmark If exceed this request",
)
parser.add_argument(
"--http-timeout", type=int, env_var="HTTP_TIMEOUT", default=3000, help="Http timeout before receive response"
"--http-timeout", type=int, env_var="HTTP_TIMEOUT", default=120000, help="Http timeout before receive response"
)
parser.add_argument(
"--bench-target",
Expand Down
16 changes: 12 additions & 4 deletions evals/benchmark/stresscli/locust/tokenresponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def staticsOutput(environment, reqlist):
"Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, Input Tokens: {},"
" Output Tokens: {}, RPS: {:.2f}, Input Tokens per Second: {:.2f}, Output Tokens per Second: {:.2f}"
)
e2e_msg = "End to End latency(ms), P50: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
first_msg = "First token latency(ms), P50: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
first_msg = "First token latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
next_msg = "Average Next token latency(ms): {:.2f}"
average_msg = "Average token latency(ms) : {:.2f}"
console_logger.warning("\n=================Total statistics=====================")
Expand Down Expand Up @@ -92,12 +92,20 @@ def staticsOutput(environment, reqlist):
)
)
console_logger.warning(
e2e_msg.format(numpy.percentile(e2e_lat, 50), numpy.percentile(e2e_lat, 99), numpy.average(e2e_lat))
e2e_msg.format(
numpy.percentile(e2e_lat, 50),
numpy.percentile(e2e_lat, 90),
numpy.percentile(e2e_lat, 99),
numpy.average(e2e_lat),
)
)
if tokens_output != 0:
console_logger.warning(
first_msg.format(
numpy.percentile(first_token, 50), numpy.percentile(first_token, 99), numpy.average(first_token)
numpy.percentile(first_token, 50),
numpy.percentile(first_token, 90),
numpy.percentile(first_token, 99),
numpy.average(first_token),
)
)
console_logger.warning(next_msg.format(numpy.average(next_token)))
Expand Down

0 comments on commit b07cd12

Please sign in to comment.