Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarks: Bug Fix - Make metrics of dist-inference-cpp aligned with PyTorch version #596

Merged
merged 10 commits into from
Dec 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions superbench/benchmarks/micro_benchmarks/dist_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,13 +493,12 @@ def _process_raw_result(self, cmd_idx, raw_output):

try:
output_lines = [x.strip() for x in raw_output.strip().splitlines()]
step_time = None
step_times = []
for output_line in output_lines:
if ' ms per iteration' in output_line:
step_time = float(output_line.split(' ms per iteration')[0].split()[-1])
break
if output_line.startswith('Latency of step'):
step_times.append(float(output_line.split(' ms')[0].split()[-1]))
return self._process_numeric_result(
'step_times', [step_time], reduce_type=ReduceType.MAX, cal_percentile=True
'step_times', step_times, reduce_type=ReduceType.MAX, cal_percentile=True
)
except BaseException as e:
return self._set_error_code_and_print_error_msg(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
#endif

std::chrono::steady_clock::time_point start_time, stop_time;
std::vector<double> step_times(num_iters, 0.);
for (int i = 0; i < num_warmups + num_iters; ++i) {
if (i == num_warmups) {
if (i >= num_warmups) {
start_time = std::chrono::steady_clock::now();
}
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
Expand All @@ -357,11 +358,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
model_forward();
#endif
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
if (i >= num_warmups) {
stop_time = std::chrono::steady_clock::now();
double step_time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - start_time).count();
step_times[i - num_warmups] = step_time;
}
}
for (int i = 0; i < num_iters; i++) {
fprintf(stdout, "Latency of step %d: %g ms\n", i, step_times[i] / 1e6);
}
stop_time = std::chrono::steady_clock::now();
double duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
fprintf(stdout, "Time: %g ms in total, %g ms per iteration, %g ms per layer\n", duration, duration / num_iters,
duration / num_iters / num_layers);

#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
// Destroy graph
Expand Down
19 changes: 8 additions & 11 deletions tests/benchmarks/micro_benchmarks/test_dist_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

"""Tests for distributed inference benchmark."""

import numbers
import unittest

from tests.helper import decorator
Expand Down Expand Up @@ -209,19 +208,17 @@ def _test_dist_inference_result_parsing(self, platform, test_raw_output):
# step_times
assert (len(benchmark.raw_data) == 2)
# return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
test_latency = float(test_raw_output.splitlines()[-1].split(' ms per iteration')[0].split()[-1])
assert (7 == len(benchmark.result))
for output_key in benchmark.result:
if output_key == 'return_code':
assert (benchmark.result[output_key] == [0])
else:
assert (output_key.startswith('step_times'))
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
assert (test_latency == benchmark.result[output_key][0])
assert (benchmark.result['return_code'] == [0])
assert (benchmark.result['step_times'] == [1.9052048])
assert (benchmark.result['step_times_50'] == [1.851])
assert (benchmark.result['step_times_90'] == [1.89637])
assert (benchmark.result['step_times_95'] == [2.12037])
assert (benchmark.result['step_times_99'] == [2.67155])
assert (benchmark.result['step_times_99.9'] == [4.4198])

# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)

@decorator.cuda_test
Expand Down
102 changes: 100 additions & 2 deletions tests/data/dist_inference.log
Original file line number Diff line number Diff line change
@@ -1,2 +1,100 @@
Parameters: m=80, n=128, k=128, alpha=1.000000, beta=1.000000, num_layers=50, num_warmups=20, num_iters=100, use_cuda_graph=0
Time: 173 ms in total, 1.73 ms per iteration, 0.0346 ms per layer
Latency of step 0: 1.8339 ms
Latency of step 1: 1.84222 ms
Latency of step 2: 1.90869 ms
Latency of step 3: 1.85375 ms
Latency of step 4: 1.87192 ms
Latency of step 5: 1.84254 ms
Latency of step 6: 1.91165 ms
Latency of step 7: 1.8214 ms
Latency of step 8: 1.91427 ms
Latency of step 9: 1.89586 ms
Latency of step 10: 1.86816 ms
Latency of step 11: 1.85105 ms
Latency of step 12: 1.84486 ms
Latency of step 13: 1.84915 ms
Latency of step 14: 1.82332 ms
Latency of step 15: 1.91444 ms
Latency of step 16: 1.85073 ms
Latency of step 17: 1.81812 ms
Latency of step 18: 2.67155 ms
Latency of step 19: 1.85119 ms
Latency of step 20: 1.87989 ms
Latency of step 21: 1.83932 ms
Latency of step 22: 1.84041 ms
Latency of step 23: 1.84789 ms
Latency of step 24: 1.85079 ms
Latency of step 25: 1.82229 ms
Latency of step 26: 1.83376 ms
Latency of step 27: 1.851 ms
Latency of step 28: 1.86246 ms
Latency of step 29: 1.8371 ms
Latency of step 30: 1.88932 ms
Latency of step 31: 1.84459 ms
Latency of step 32: 1.82725 ms
Latency of step 33: 1.83566 ms
Latency of step 34: 1.84041 ms
Latency of step 35: 1.87058 ms
Latency of step 36: 1.84038 ms
Latency of step 37: 1.85555 ms
Latency of step 38: 1.85848 ms
Latency of step 39: 2.40561 ms
Latency of step 40: 1.85029 ms
Latency of step 41: 1.84562 ms
Latency of step 42: 1.8351 ms
Latency of step 43: 1.84196 ms
Latency of step 44: 1.86032 ms
Latency of step 45: 1.87147 ms
Latency of step 46: 1.84832 ms
Latency of step 47: 1.85715 ms
Latency of step 48: 1.86012 ms
Latency of step 49: 1.86327 ms
Latency of step 50: 1.84388 ms
Latency of step 51: 1.86396 ms
Latency of step 52: 1.85538 ms
Latency of step 53: 1.85564 ms
Latency of step 54: 1.83979 ms
Latency of step 55: 1.85334 ms
Latency of step 56: 1.85712 ms
Latency of step 57: 1.85284 ms
Latency of step 58: 1.84534 ms
Latency of step 59: 1.86041 ms
Latency of step 60: 1.86305 ms
Latency of step 61: 2.2213 ms
Latency of step 62: 1.83054 ms
Latency of step 63: 4.4198 ms
Latency of step 64: 1.87245 ms
Latency of step 65: 1.83845 ms
Latency of step 66: 1.82047 ms
Latency of step 67: 1.81191 ms
Latency of step 68: 1.83887 ms
Latency of step 69: 1.8463 ms
Latency of step 70: 2.12037 ms
Latency of step 71: 1.85782 ms
Latency of step 72: 1.84939 ms
Latency of step 73: 1.82054 ms
Latency of step 74: 1.8866 ms
Latency of step 75: 1.83937 ms
Latency of step 76: 1.84167 ms
Latency of step 77: 1.89637 ms
Latency of step 78: 1.8392 ms
Latency of step 79: 1.83754 ms
Latency of step 80: 1.84721 ms
Latency of step 81: 1.88112 ms
Latency of step 82: 1.84474 ms
Latency of step 83: 1.84084 ms
Latency of step 84: 1.85134 ms
Latency of step 85: 1.85315 ms
Latency of step 86: 1.83406 ms
Latency of step 87: 1.87803 ms
Latency of step 88: 1.8369 ms
Latency of step 89: 1.85909 ms
Latency of step 90: 1.84519 ms
Latency of step 91: 2.52689 ms
Latency of step 92: 1.86594 ms
Latency of step 93: 1.86974 ms
Latency of step 94: 1.85219 ms
Latency of step 95: 1.86255 ms
Latency of step 96: 1.82652 ms
Latency of step 97: 1.84379 ms
Latency of step 98: 1.84553 ms
Latency of step 99: 1.87082 ms
Loading