Skip to content

Commit

Permalink
Benchmarks: Bug Fix - Make metrics of dist-inference-cpp aligned with…
Browse files Browse the repository at this point in the history
… PyTorch version (#596)

**Description**
Align the metrics of dist-inference-cpp with those of the PyTorch version.

---------

Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
  • Loading branch information
2 people authored and abuccts committed Jan 3, 2024
1 parent 0dd0a61 commit 6482a89
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 23 deletions.
9 changes: 4 additions & 5 deletions superbench/benchmarks/micro_benchmarks/dist_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,13 +493,12 @@ def _process_raw_result(self, cmd_idx, raw_output):

try:
output_lines = [x.strip() for x in raw_output.strip().splitlines()]
step_time = None
step_times = []
for output_line in output_lines:
if ' ms per iteration' in output_line:
step_time = float(output_line.split(' ms per iteration')[0].split()[-1])
break
if output_line.startswith('Latency of step'):
step_times.append(float(output_line.split(' ms')[0].split()[-1]))
return self._process_numeric_result(
'step_times', [step_time], reduce_type=ReduceType.MAX, cal_percentile=True
'step_times', step_times, reduce_type=ReduceType.MAX, cal_percentile=True
)
except BaseException as e:
return self._set_error_code_and_print_error_msg(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
#endif

std::chrono::steady_clock::time_point start_time, stop_time;
std::vector<double> step_times(num_iters, 0.);
for (int i = 0; i < num_warmups + num_iters; ++i) {
if (i == num_warmups) {
if (i >= num_warmups) {
start_time = std::chrono::steady_clock::now();
}
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
Expand All @@ -357,11 +358,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
model_forward();
#endif
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
if (i >= num_warmups) {
stop_time = std::chrono::steady_clock::now();
double step_time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - start_time).count();
step_times[i - num_warmups] = step_time;
}
}
for (int i = 0; i < num_iters; i++) {
fprintf(stdout, "Latency of step %d: %g ms\n", i, step_times[i] / 1e6);
}
stop_time = std::chrono::steady_clock::now();
double duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
fprintf(stdout, "Time: %g ms in total, %g ms per iteration, %g ms per layer\n", duration, duration / num_iters,
duration / num_iters / num_layers);

#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
// Destroy graph
Expand Down
19 changes: 8 additions & 11 deletions tests/benchmarks/micro_benchmarks/test_dist_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

"""Tests for distributed inference benchmark."""

import numbers
import unittest

from tests.helper import decorator
Expand Down Expand Up @@ -209,19 +208,17 @@ def _test_dist_inference_result_parsing(self, platform, test_raw_output):
# step_times
assert (len(benchmark.raw_data) == 2)
# return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
test_latency = float(test_raw_output.splitlines()[-1].split(' ms per iteration')[0].split()[-1])
assert (7 == len(benchmark.result))
for output_key in benchmark.result:
if output_key == 'return_code':
assert (benchmark.result[output_key] == [0])
else:
assert (output_key.startswith('step_times'))
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
assert (test_latency == benchmark.result[output_key][0])
assert (benchmark.result['return_code'] == [0])
assert (benchmark.result['step_times'] == [1.9052048])
assert (benchmark.result['step_times_50'] == [1.851])
assert (benchmark.result['step_times_90'] == [1.89637])
assert (benchmark.result['step_times_95'] == [2.12037])
assert (benchmark.result['step_times_99'] == [2.67155])
assert (benchmark.result['step_times_99.9'] == [4.4198])

# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)

@decorator.cuda_test
Expand Down
102 changes: 100 additions & 2 deletions tests/data/dist_inference.log
Original file line number Diff line number Diff line change
@@ -1,2 +1,100 @@
Parameters: m=80, n=128, k=128, alpha=1.000000, beta=1.000000, num_layers=50, num_warmups=20, num_iters=100, use_cuda_graph=0
Time: 173 ms in total, 1.73 ms per iteration, 0.0346 ms per layer
Latency of step 0: 1.8339 ms
Latency of step 1: 1.84222 ms
Latency of step 2: 1.90869 ms
Latency of step 3: 1.85375 ms
Latency of step 4: 1.87192 ms
Latency of step 5: 1.84254 ms
Latency of step 6: 1.91165 ms
Latency of step 7: 1.8214 ms
Latency of step 8: 1.91427 ms
Latency of step 9: 1.89586 ms
Latency of step 10: 1.86816 ms
Latency of step 11: 1.85105 ms
Latency of step 12: 1.84486 ms
Latency of step 13: 1.84915 ms
Latency of step 14: 1.82332 ms
Latency of step 15: 1.91444 ms
Latency of step 16: 1.85073 ms
Latency of step 17: 1.81812 ms
Latency of step 18: 2.67155 ms
Latency of step 19: 1.85119 ms
Latency of step 20: 1.87989 ms
Latency of step 21: 1.83932 ms
Latency of step 22: 1.84041 ms
Latency of step 23: 1.84789 ms
Latency of step 24: 1.85079 ms
Latency of step 25: 1.82229 ms
Latency of step 26: 1.83376 ms
Latency of step 27: 1.851 ms
Latency of step 28: 1.86246 ms
Latency of step 29: 1.8371 ms
Latency of step 30: 1.88932 ms
Latency of step 31: 1.84459 ms
Latency of step 32: 1.82725 ms
Latency of step 33: 1.83566 ms
Latency of step 34: 1.84041 ms
Latency of step 35: 1.87058 ms
Latency of step 36: 1.84038 ms
Latency of step 37: 1.85555 ms
Latency of step 38: 1.85848 ms
Latency of step 39: 2.40561 ms
Latency of step 40: 1.85029 ms
Latency of step 41: 1.84562 ms
Latency of step 42: 1.8351 ms
Latency of step 43: 1.84196 ms
Latency of step 44: 1.86032 ms
Latency of step 45: 1.87147 ms
Latency of step 46: 1.84832 ms
Latency of step 47: 1.85715 ms
Latency of step 48: 1.86012 ms
Latency of step 49: 1.86327 ms
Latency of step 50: 1.84388 ms
Latency of step 51: 1.86396 ms
Latency of step 52: 1.85538 ms
Latency of step 53: 1.85564 ms
Latency of step 54: 1.83979 ms
Latency of step 55: 1.85334 ms
Latency of step 56: 1.85712 ms
Latency of step 57: 1.85284 ms
Latency of step 58: 1.84534 ms
Latency of step 59: 1.86041 ms
Latency of step 60: 1.86305 ms
Latency of step 61: 2.2213 ms
Latency of step 62: 1.83054 ms
Latency of step 63: 4.4198 ms
Latency of step 64: 1.87245 ms
Latency of step 65: 1.83845 ms
Latency of step 66: 1.82047 ms
Latency of step 67: 1.81191 ms
Latency of step 68: 1.83887 ms
Latency of step 69: 1.8463 ms
Latency of step 70: 2.12037 ms
Latency of step 71: 1.85782 ms
Latency of step 72: 1.84939 ms
Latency of step 73: 1.82054 ms
Latency of step 74: 1.8866 ms
Latency of step 75: 1.83937 ms
Latency of step 76: 1.84167 ms
Latency of step 77: 1.89637 ms
Latency of step 78: 1.8392 ms
Latency of step 79: 1.83754 ms
Latency of step 80: 1.84721 ms
Latency of step 81: 1.88112 ms
Latency of step 82: 1.84474 ms
Latency of step 83: 1.84084 ms
Latency of step 84: 1.85134 ms
Latency of step 85: 1.85315 ms
Latency of step 86: 1.83406 ms
Latency of step 87: 1.87803 ms
Latency of step 88: 1.8369 ms
Latency of step 89: 1.85909 ms
Latency of step 90: 1.84519 ms
Latency of step 91: 2.52689 ms
Latency of step 92: 1.86594 ms
Latency of step 93: 1.86974 ms
Latency of step 94: 1.85219 ms
Latency of step 95: 1.86255 ms
Latency of step 96: 1.82652 ms
Latency of step 97: 1.84379 ms
Latency of step 98: 1.84553 ms
Latency of step 99: 1.87082 ms

0 comments on commit 6482a89

Please sign in to comment.