microsoft · yzygitzh · Dec 16, 2023 · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023
@@ -493,13 +493,12 @@ def _process_raw_result(self, cmd_idx, raw_output):
 
         try:
             output_lines = [x.strip() for x in raw_output.strip().splitlines()]
-            step_time = None
+            step_times = []
             for output_line in output_lines:
-                if ' ms per iteration' in output_line:
-                    step_time = float(output_line.split(' ms per iteration')[0].split()[-1])
-                    break
+                if output_line.startswith('Latency of step'):
+                    step_times.append(float(output_line.split(' ms')[0].split()[-1]))
             return self._process_numeric_result(
-                'step_times', [step_time], reduce_type=ReduceType.MAX, cal_percentile=True
+                'step_times', step_times, reduce_type=ReduceType.MAX, cal_percentile=True
             )
         except BaseException as e:
             return self._set_error_code_and_print_error_msg(

@@ -343,8 +343,9 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
 #endif
 
     std::chrono::steady_clock::time_point start_time, stop_time;
+    std::vector<double> step_times(num_iters, 0.);
     for (int i = 0; i < num_warmups + num_iters; ++i) {
-        if (i == num_warmups) {
+        if (i >= num_warmups) {
             start_time = std::chrono::steady_clock::now();
         }
 #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
@@ -357,11 +358,15 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
         model_forward();
 #endif
         CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+        if (i >= num_warmups) {
+            stop_time = std::chrono::steady_clock::now();
+            double step_time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - start_time).count();
+            step_times[i - num_warmups] = step_time;
+        }
+    }
+    for (int i = 0; i < num_iters; i++) {
+        fprintf(stdout, "Latency of step %d: %g ms\n", i, step_times[i] / 1e6);
     }
-    stop_time = std::chrono::steady_clock::now();
-    double duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
-    fprintf(stdout, "Time: %g ms in total, %g ms per iteration, %g ms per layer\n", duration, duration / num_iters,
-            duration / num_iters / num_layers);
 
 #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && (CUDART_VERSION >= 11030 || HIP_VERSION >= 50221310)
     // Destroy graph

@@ -3,7 +3,6 @@
 
 """Tests for distributed inference benchmark."""
 
-import numbers
 import unittest
 
 from tests.helper import decorator
@@ -209,19 +208,17 @@ def _test_dist_inference_result_parsing(self, platform, test_raw_output):
         # step_times
         assert (len(benchmark.raw_data) == 2)
         # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
-        test_latency = float(test_raw_output.splitlines()[-1].split(' ms per iteration')[0].split()[-1])
         assert (7 == len(benchmark.result))
-        for output_key in benchmark.result:
-            if output_key == 'return_code':
-                assert (benchmark.result[output_key] == [0])
-            else:
-                assert (output_key.startswith('step_times'))
-                assert (len(benchmark.result[output_key]) == 1)
-                assert (isinstance(benchmark.result[output_key][0], numbers.Number))
-                assert (test_latency == benchmark.result[output_key][0])
+        assert (benchmark.result['return_code'] == [0])
+        assert (benchmark.result['step_times'] == [1.9052048])
+        assert (benchmark.result['step_times_50'] == [1.851])
+        assert (benchmark.result['step_times_90'] == [1.89637])
+        assert (benchmark.result['step_times_95'] == [2.12037])
+        assert (benchmark.result['step_times_99'] == [2.67155])
+        assert (benchmark.result['step_times_99.9'] == [4.4198])
 
         # Negative case - invalid raw output.
-        assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
+        assert (benchmark._process_raw_result(1, 'Latency of step: xxx ms') is False)
         assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
 
     @decorator.cuda_test

@@ -1,2 +1,100 @@
-Parameters: m=80, n=128, k=128, alpha=1.000000, beta=1.000000, num_layers=50, num_warmups=20, num_iters=100, use_cuda_graph=0
-Time: 173 ms in total, 1.73 ms per iteration, 0.0346 ms per layer
+Latency of step 0: 1.8339 ms
+Latency of step 1: 1.84222 ms
+Latency of step 2: 1.90869 ms
+Latency of step 3: 1.85375 ms
+Latency of step 4: 1.87192 ms
+Latency of step 5: 1.84254 ms
+Latency of step 6: 1.91165 ms
+Latency of step 7: 1.8214 ms
+Latency of step 8: 1.91427 ms
+Latency of step 9: 1.89586 ms
+Latency of step 10: 1.86816 ms
+Latency of step 11: 1.85105 ms
+Latency of step 12: 1.84486 ms
+Latency of step 13: 1.84915 ms
+Latency of step 14: 1.82332 ms
+Latency of step 15: 1.91444 ms
+Latency of step 16: 1.85073 ms
+Latency of step 17: 1.81812 ms
+Latency of step 18: 2.67155 ms
+Latency of step 19: 1.85119 ms
+Latency of step 20: 1.87989 ms
+Latency of step 21: 1.83932 ms
+Latency of step 22: 1.84041 ms
+Latency of step 23: 1.84789 ms
+Latency of step 24: 1.85079 ms
+Latency of step 25: 1.82229 ms
+Latency of step 26: 1.83376 ms
+Latency of step 27: 1.851 ms
+Latency of step 28: 1.86246 ms
+Latency of step 29: 1.8371 ms
+Latency of step 30: 1.88932 ms
+Latency of step 31: 1.84459 ms
+Latency of step 32: 1.82725 ms
+Latency of step 33: 1.83566 ms
+Latency of step 34: 1.84041 ms
+Latency of step 35: 1.87058 ms
+Latency of step 36: 1.84038 ms
+Latency of step 37: 1.85555 ms
+Latency of step 38: 1.85848 ms
+Latency of step 39: 2.40561 ms
+Latency of step 40: 1.85029 ms
+Latency of step 41: 1.84562 ms
+Latency of step 42: 1.8351 ms
+Latency of step 43: 1.84196 ms
+Latency of step 44: 1.86032 ms
+Latency of step 45: 1.87147 ms
+Latency of step 46: 1.84832 ms
+Latency of step 47: 1.85715 ms
+Latency of step 48: 1.86012 ms
+Latency of step 49: 1.86327 ms
+Latency of step 50: 1.84388 ms
+Latency of step 51: 1.86396 ms
+Latency of step 52: 1.85538 ms
+Latency of step 53: 1.85564 ms
+Latency of step 54: 1.83979 ms
+Latency of step 55: 1.85334 ms
+Latency of step 56: 1.85712 ms
+Latency of step 57: 1.85284 ms
+Latency of step 58: 1.84534 ms
+Latency of step 59: 1.86041 ms
+Latency of step 60: 1.86305 ms
+Latency of step 61: 2.2213 ms
+Latency of step 62: 1.83054 ms
+Latency of step 63: 4.4198 ms
+Latency of step 64: 1.87245 ms
+Latency of step 65: 1.83845 ms
+Latency of step 66: 1.82047 ms
+Latency of step 67: 1.81191 ms
+Latency of step 68: 1.83887 ms
+Latency of step 69: 1.8463 ms
+Latency of step 70: 2.12037 ms
+Latency of step 71: 1.85782 ms
+Latency of step 72: 1.84939 ms
+Latency of step 73: 1.82054 ms
+Latency of step 74: 1.8866 ms
+Latency of step 75: 1.83937 ms
+Latency of step 76: 1.84167 ms
+Latency of step 77: 1.89637 ms
+Latency of step 78: 1.8392 ms
+Latency of step 79: 1.83754 ms
+Latency of step 80: 1.84721 ms
+Latency of step 81: 1.88112 ms
+Latency of step 82: 1.84474 ms
+Latency of step 83: 1.84084 ms
+Latency of step 84: 1.85134 ms
+Latency of step 85: 1.85315 ms
+Latency of step 86: 1.83406 ms
+Latency of step 87: 1.87803 ms
+Latency of step 88: 1.8369 ms
+Latency of step 89: 1.85909 ms
+Latency of step 90: 1.84519 ms
+Latency of step 91: 2.52689 ms
+Latency of step 92: 1.86594 ms
+Latency of step 93: 1.86974 ms
+Latency of step 94: 1.85219 ms
+Latency of step 95: 1.86255 ms
+Latency of step 96: 1.82652 ms
+Latency of step 97: 1.84379 ms
+Latency of step 98: 1.84553 ms
+Latency of step 99: 1.87082 ms