diff --git a/docs/design-docs/benchmarks.md b/docs/design-docs/benchmarks.md
index 3ccf7e4ce..f58504c78 100644
--- a/docs/design-docs/benchmarks.md
+++ b/docs/design-docs/benchmarks.md
@@ -238,22 +238,22 @@ result = {
'run_count': N,
'return_code': 0,
'raw_data': {
- 'throughput-train-float32': [[step1_time, ..., stepK_time], ..., […]],
- 'throughput-train-float16': [[step1_time, ..., stepK_time], ..., […]],
- 'throughput-inference-float32': [[step1_time, ..., stepK_time], ..., […]],
- 'throughput-inference-float16': [[step1_time, ..., stepK_time], ..., […]],
+ 'fp32_train_throughput': [[step1_throughput, ..., stepK_throughput], ..., […]],
+ 'fp16_train_throughput': [[step1_throughput, ..., stepK_throughput], ..., […]],
+ 'fp32_inference_throughput': [[step1_throughput, ..., stepK_throughput], ..., […]],
+ 'fp16_inference_throughput': [[step1_throughput, ..., stepK_throughput], ..., […]],
},
'result': {
- 'throughput-train-float32': [avg_throughput1, ..., avg_throughputN],
- 'throughput-train-float16': [avg_throughput1, ..., avg_throughputN],
- 'throughput-inference-float32': [avg_throughput1, ..., avg_throughputN],
- 'throughput-inference-float16': [avg_throughput1, ..., avg_throughputN],
+ 'fp32_train_throughput': [avg_throughput1, ..., avg_throughputN],
+ 'fp16_train_throughput': [avg_throughput1, ..., avg_throughputN],
+ 'fp32_inference_throughput': [avg_throughput1, ..., avg_throughputN],
+ 'fp16_inference_throughput': [avg_throughput1, ..., avg_throughputN],
},
'reduce_op': {
- 'throughput-train-float32': 'min',
- 'throughput-train-float16': 'min',
- 'throughput-inference-float32': None,
- 'throughput-inference-float16': None,
+ 'fp32_train_throughput': 'min',
+ 'fp16_train_throughput': 'min',
+ 'fp32_inference_throughput': None,
+ 'fp16_inference_throughput': None,
},
}
```
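To make the schema above concrete, here is a minimal sketch of how a downstream consumer might apply each metric's `reduce_op` to the per-run values in `result`; the `summarize` helper and the sample numbers are hypothetical, not part of this change.

```python
# Hypothetical consumer of the result schema above: reduce each metric's
# N per-run averages according to its declared reduce_op.
reduce_ops = {'min': min, 'max': max, None: lambda values: values}

def summarize(result):
    """Collapse result['result'] using the per-metric reduce_op declarations."""
    return {
        metric: reduce_ops[result['reduce_op'].get(metric)](values)
        for metric, values in result['result'].items()
    }

sample = {
    'result': {
        'fp32_train_throughput': [105.2, 103.8, 104.9],
        'fp32_inference_throughput': [331.0, 329.5],
    },
    'reduce_op': {
        'fp32_train_throughput': 'min',
        'fp32_inference_throughput': None,
    },
}
print(summarize(sample))
# -> {'fp32_train_throughput': 103.8, 'fp32_inference_throughput': [331.0, 329.5]}
```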
diff --git a/docs/user-tutorial/benchmarks/docker-benchmarks.md b/docs/user-tutorial/benchmarks/docker-benchmarks.md
index 73a617956..0cd9d04e5 100644
--- a/docs/user-tutorial/benchmarks/docker-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/docker-benchmarks.md
@@ -14,15 +14,15 @@ Run the rocm onnxruntime model training benchmarks packaged in docker `superbenc
#### Metrics
-| Name | Unit | Description |
-|-------------------------------------------------------|------------------------|-----------------------------------------------------------|
-| onnxruntime-ort-models/bert_large_uncased_ngpu_1 | throughput (samples/s) | The throughput of bert large uncased model on 1 GPU. |
-| onnxruntime-ort-models/bert_large_uncased_ngpu_8 | throughput (samples/s) | The throughput of bert large uncased model on 8 GPU. |
-| onnxruntime-ort-models/distilbert_base_uncased_ngpu_1 | throughput (samples/s) | The throughput of distilbert base uncased model on 1 GPU. |
-| onnxruntime-ort-models/distilbert_base_uncased_ngpu_8 | throughput (samples/s) | The throughput of distilbert base uncased model on 8 GPU. |
-| onnxruntime-ort-models/gpt2_ngpu_1 | throughput (samples/s) | The throughput of gpt2 model on 1 GPU. |
-| onnxruntime-ort-models/gpt2_ngpu_8 | throughput (samples/s) | The throughput of gpt2 model on 8 GPU. |
-| onnxruntime-ort-models/facebook_bart_large_ngpu_1 | throughput (samples/s) | The throughput of facebook bart large model on 1 GPU. |
-| onnxruntime-ort-models/facebook_bart_large_ngpu_8 | throughput (samples/s) | The throughput of facebook bart large model on 8 GPU. |
-| onnxruntime-ort-models/roberta_large_ngpu_1 | throughput (samples/s) | The throughput of roberta large model on 1 GPU. |
-| onnxruntime-ort-models/roberta_large_ngpu_8 | throughput (samples/s) | The throughput of roberta large model on 8 GPU. |
+| Name | Unit | Description |
+|------------------------------------------------------------------------|------------------------|-----------------------------------------------------------|
+| onnxruntime-ort-models/bert_large_uncased_ngpu_1_train_throughput | throughput (samples/s) | The throughput of bert large uncased model on 1 GPU. |
+| onnxruntime-ort-models/bert_large_uncased_ngpu_8_train_throughput | throughput (samples/s) | The throughput of bert large uncased model on 8 GPU. |
+| onnxruntime-ort-models/distilbert_base_uncased_ngpu_1_train_throughput | throughput (samples/s) | The throughput of distilbert base uncased model on 1 GPU. |
+| onnxruntime-ort-models/distilbert_base_uncased_ngpu_8_train_throughput | throughput (samples/s) | The throughput of distilbert base uncased model on 8 GPU. |
+| onnxruntime-ort-models/gpt2_ngpu_1_train_throughput | throughput (samples/s) | The throughput of gpt2 model on 1 GPU. |
+| onnxruntime-ort-models/gpt2_ngpu_8_train_throughput | throughput (samples/s) | The throughput of gpt2 model on 8 GPU. |
+| onnxruntime-ort-models/facebook_bart_large_ngpu_1_train_throughput | throughput (samples/s) | The throughput of facebook bart large model on 1 GPU. |
+| onnxruntime-ort-models/facebook_bart_large_ngpu_8_train_throughput | throughput (samples/s) | The throughput of facebook bart large model on 8 GPU. |
+| onnxruntime-ort-models/roberta_large_ngpu_1_train_throughput | throughput (samples/s) | The throughput of roberta large model on 1 GPU. |
+| onnxruntime-ort-models/roberta_large_ngpu_8_train_throughput | throughput (samples/s) | The throughput of roberta large model on 8 GPU. |
diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 08c6c7467..9f881da12 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -15,10 +15,10 @@ which is defined as the time range from the beginning of the launch API call to
#### Metrics
-| Name | Unit | Description |
-|------------------------------|-----------|--------------------------------------|
-| kernel-launch/event_overhead | time (ms) | Launch latency measured in GPU time. |
-| kernel-launch/wall_overhead | time (ms) | Launch latency measured in CPU time. |
+| Name | Unit | Description |
+|--------------------------|-----------|--------------------------------------|
+| kernel-launch/event_time | time (ms) | Launch latency measured in GPU time. |
+| kernel-launch/wall_time | time (ms) | Launch latency measured in CPU time. |
### `gemm-flops`
@@ -30,21 +30,21 @@ or AMD [rocblas-bench](https://github.com/ROCmSoftwarePlatform/rocBLAS/tree/deve
#### Metrics
-| Name | Unit | Description |
-|------------------------|----------------|---------------------------------------------------------|
-| gemm-flops/FP64 | FLOPS (GFLOPS) | GEMM float64 peak FLOPS. |
-| gemm-flops/FP32 | FLOPS (GFLOPS) | GEMM float32 peak FLOPS. |
-| gemm-flops/FP16 | FLOPS (GFLOPS) | GEMM float16 peak FLOPS. |
-| gemm-flops/FP64_TC | FLOPS (GFLOPS) | GEMM float64 peak FLOPS with NVIDIA Tensor Core. |
-| gemm-flops/TF32_TC | FLOPS (GFLOPS) | GEMM tensor-float32 peak FLOPS with NVIDIA Tensor Core. |
-| gemm-flops/FP16_TC | FLOPS (GFLOPS) | GEMM float16 peak FLOPS with NVIDIA Tensor Core. |
-| gemm-flops/BF16_TC | FLOPS (GFLOPS) | GEMM bfloat16 peak FLOPS with NVIDIA Tensor Core. |
-| gemm-flops/INT8_TC | IOPS (GIOPS) | GEMM int8 peak IOPS with NVIDIA Tensor Core. |
-| gemm-flops/INT4_TC | IOPS (GIOPS) | GEMM int4 peak IOPS with NVIDIA Tensor Core. |
-| gemm-flops/FP32_xDLOPS | FLOPS (GFLOPS) | GEMM tensor-float32 peak FLOPS with AMD XDLOPS. |
-| gemm-flops/FP16_xDLOPS | FLOPS (GFLOPS) | GEMM float16 peak FLOPS with AMD XDLOPS. |
-| gemm-flops/BF16_xDLOPS | FLOPS (GFLOPS) | GEMM bfloat16 peak FLOPS with AMD XDLOPS. |
-| gemm-flops/INT8_xDLOPS | IOPS (GIOPS) | GEMM int8 peak IOPS with AMD XDLOPS. |
+| Name | Unit | Description |
+|------------------------------|----------------|---------------------------------------------------------|
+| gemm-flops/fp64_flops | FLOPS (GFLOPS) | GEMM float64 peak FLOPS. |
+| gemm-flops/fp32_flops | FLOPS (GFLOPS) | GEMM float32 peak FLOPS. |
+| gemm-flops/fp16_flops | FLOPS (GFLOPS) | GEMM float16 peak FLOPS. |
+| gemm-flops/fp64_tc_flops | FLOPS (GFLOPS) | GEMM float64 peak FLOPS with NVIDIA Tensor Core. |
+| gemm-flops/tf32_tc_flops | FLOPS (GFLOPS) | GEMM tensor-float32 peak FLOPS with NVIDIA Tensor Core. |
+| gemm-flops/fp16_tc_flops | FLOPS (GFLOPS) | GEMM float16 peak FLOPS with NVIDIA Tensor Core. |
+| gemm-flops/bf16_tc_flops | FLOPS (GFLOPS) | GEMM bfloat16 peak FLOPS with NVIDIA Tensor Core. |
+| gemm-flops/int8_tc_iops | IOPS (GIOPS) | GEMM int8 peak IOPS with NVIDIA Tensor Core. |
+| gemm-flops/int4_tc_iops | IOPS (GIOPS) | GEMM int4 peak IOPS with NVIDIA Tensor Core. |
+| gemm-flops/fp32_xdlops_flops | FLOPS (GFLOPS) | GEMM float32 peak FLOPS with AMD XDLOPS.                |
+| gemm-flops/fp16_xdlops_flops | FLOPS (GFLOPS) | GEMM float16 peak FLOPS with AMD XDLOPS. |
+| gemm-flops/bf16_xdlops_flops | FLOPS (GFLOPS) | GEMM bfloat16 peak FLOPS with AMD XDLOPS. |
+| gemm-flops/int8_xdlops_iops | IOPS (GIOPS) | GEMM int8 peak IOPS with AMD XDLOPS. |
### `matmul`
@@ -54,9 +54,9 @@ Large scale matmul operation using `torch.matmul` with one GPU.
#### Metrics
-| Name | Unit | Description |
-|---------------------------|-----------|--------------------------------|
-| pytorch-matmul/nosharding | time (ms) | Time of pure matmul operation. |
+| Name | Unit | Description |
+|--------------------------------|-----------|--------------------------------|
+| pytorch-matmul/nosharding_time | time (ms) | Time of pure matmul operation. |
### `cublas-function`
@@ -74,14 +74,14 @@ Inference PyTorch/ONNX models on NVIDIA GPUs with [TensorRT](https://developer.n
#### Metrics
-| Name | Unit | Description |
-|-------------------------------------------|-----------|----------------------------------------------------------------------------------------------------------|
-| tensorrt-inference/gpu_lat_ms_mean | time (ms) | The mean GPU latency to execute the kernels for a query. |
-| tensorrt-inference/gpu_lat_ms_99 | time (ms) | The 99th percentile GPU latency to execute the kernels for a query. |
-| tensorrt-inference/host_lat_ms_mean | time (ms) | The mean H2D, GPU, and D2H latency to execute the kernels for a query. |
-| tensorrt-inference/host_lat_ms_99 | time (ms) | The 99th percentile H2D, GPU, and D2H latency to execute the kernels for a query. |
-| tensorrt-inference/end_to_end_lat_ms_mean | time (ms) | The mean duration from when the H2D of a query is called to when the D2H of the same query is completed. |
-| tensorrt-inference/end_to_end_lat_ms_99 | time (ms) | The P99 duration from when the H2D of a query is called to when the D2H of the same query is completed. |
+| Name | Unit | Description |
+|--------------------------------------------------|-----------|----------------------------------------------------------------------------------------------------------|
+| tensorrt-inference/${model}_gpu_time_mean | time (ms) | The mean GPU latency to execute the kernels for a query. |
+| tensorrt-inference/${model}_gpu_time_99 | time (ms) | The 99th percentile GPU latency to execute the kernels for a query. |
+| tensorrt-inference/${model}_host_time_mean | time (ms) | The mean H2D, GPU, and D2H latency to execute the kernels for a query. |
+| tensorrt-inference/${model}_host_time_99 | time (ms) | The 99th percentile H2D, GPU, and D2H latency to execute the kernels for a query. |
+| tensorrt-inference/${model}_end_to_end_time_mean | time (ms) | The mean duration from when the H2D of a query is called to when the D2H of the same query is completed. |
+| tensorrt-inference/${model}_end_to_end_time_99   | time (ms) | The 99th percentile duration from when the H2D of a query is called to when the D2H of the same query is completed.       |
## Communication Benchmarks
@@ -95,11 +95,11 @@ or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils
#### Metrics
-| Name | Unit | Description |
-|-------------------|------------------|----------------------------------|
-| mem-bw/H2D_Mem_BW | bandwidth (GB/s) | Host to device copy bandwidth. |
-| mem-bw/D2H_Mem_BW | bandwidth (GB/s) | Device to host copy bandwidth. |
-| mem-bw/D2D_Mem_BW | bandwidth (GB/s) | Device to device copy bandwidth. |
+| Name | Unit | Description |
+|---------------|------------------|----------------------------------|
+| mem-bw/h2d_bw | bandwidth (GB/s) | Host to device copy bandwidth. |
+| mem-bw/d2h_bw | bandwidth (GB/s) | Device to host copy bandwidth. |
+| mem-bw/d2d_bw | bandwidth (GB/s) | Device to device copy bandwidth. |
### `gpu-copy-bw`
@@ -107,11 +107,11 @@ Measure the memory copy bandwidth performed by GPU SM/DMA engine, including devi
#### Metrics
-| Name | Unit | Description |
-|----------------------------------------------------------------------------|------------------|----------------------------------------------------------------------------------------------------------------------------|
-| cpu\_to\_gpu[0-9]+\_by\_gpu[0-9]+\_using\_(sm\|dma)\_under_numa[0-9]+ | bandwidth (GB/s) | The bandwidth reading from all NUMA nodes' host memory using DMA engine or GPU SM by all GPUs. |
-| gpu[0-9]+\_to\_cpu\_by\_gpu[0-9]+\_using\_(sm\|dma)\_under_numa[0-9]+ | bandwidth (GB/s) | The bandwidth writing to all NUMA nodes' host memory using DMA engine or GPU SM by all GPUs. |
-| gpu[0-9]+\_to_gpu[0-9]+\_by\_gpu[0-9]+\_using\_(sm\|dma)\_under_numa[0-9]+ | bandwidth (GB/s) | The bandwidth reading from or writing to all GPUs using DMA engine or GPU SM by all GPUs with peer communication enabled. |
+| Name | Unit | Description |
+|-------------------------------------------------------------------------------|------------------|----------------------------------------------------------------------------------------------------------------------------|
+| cpu\_to\_gpu[0-9]+\_by\_gpu[0-9]+\_using\_(sm\|dma)\_under_numa[0-9]+_bw        | bandwidth (GB/s) | The bandwidth of each GPU reading each NUMA node's host memory, using the DMA engine or GPU SM.                                 |
+| gpu[0-9]+\_to\_cpu\_by\_gpu[0-9]+\_using\_(sm\|dma)\_under_numa[0-9]+_bw        | bandwidth (GB/s) | The bandwidth of each GPU writing to each NUMA node's host memory, using the DMA engine or GPU SM.                              |
+| gpu[0-9]+\_to_gpu[0-9]+\_by\_gpu[0-9]+\_using\_(sm\|dma)\_under_numa[0-9]+_bw   | bandwidth (GB/s) | The bandwidth of each GPU reading from or writing to all GPUs, using the DMA engine or GPU SM, with peer communication enabled. |
### `ib-loopback`
@@ -122,11 +122,11 @@ Measure the InfiniBand loopback verbs bandwidth, performed by
#### Metrics
-| Name | Unit | Description |
-|----------------------------------------------------|------------------|--------------------------------------------------------------|
-| ib-loopback/IB\_write\_${msg\_size}\_Avg_${ib_dev} | bandwidth (MB/s) | InfiniBand loopback write bandwidth with given message size. |
-| ib-loopback/IB\_read\_${msg\_size}\_Avg_${ib_dev} | bandwidth (MB/s) | InfiniBand loopback read bandwidth with given message size. |
-| ib-loopback/IB\_send\_${msg\_size}\_Avg_${ib_dev} | bandwidth (MB/s) | InfiniBand loopback send bandwidth with given message size. |
+| Name | Unit | Description |
+|---------------------------------------------|------------------|--------------------------------------------------------------|
+| ib-loopback/ib_write_${msg_size}_ib[0-9]_bw | bandwidth (GB/s) | InfiniBand loopback write bandwidth with the given message size. |
+| ib-loopback/ib_read_${msg_size}_ib[0-9]_bw  | bandwidth (GB/s) | InfiniBand loopback read bandwidth with the given message size.  |
+| ib-loopback/ib_send_${msg_size}_ib[0-9]_bw  | bandwidth (GB/s) | InfiniBand loopback send bandwidth with the given message size.  |
### `nccl-bw` / `rccl-bw`
@@ -157,14 +157,14 @@ performed by [tcping](https://github.com/zhengxiaowai/tcping)
#### Metrics
-| Metrics | Unit | Description |
-| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------ |
-| tcp-connectivity/Successed_${hostname/ip} | count | successed times of tcp connections between current node and other nodes |
-| tcp-connectivity/Failed_${hostname/ip} | count | failed times of tcp connections between current node and other nodes |
-| tcp-connectivity/Success_Rate_${hostname/ip} | count | success rate(successed/total) of tcp connection between current node and other nodes |
-| tcp-connectivity/Minimum_${hostname/ip} | time(ms) | mininum latency of tcp connections between current node and other nodes |
-| tcp-connectivity/Maximum_${hostname/ip} | time(ms) | maximum latency of tcp connections between current node and other nodes |
-| tcp-connectivity/Average_${hostname/ip} | time(ms) | average latency of tcp connections between current node and other nodes |
+| Metrics | Unit | Description |
+|-------------------------------------------------|-----------|---------------------------------------------------------------------------------------|
+| tcp-connectivity/${hostname/ip}_successed_count | count     | number of successful TCP connections between the current node and other nodes               |
+| tcp-connectivity/${hostname/ip}_failed_count    | count     | number of failed TCP connections between the current node and other nodes                   |
+| tcp-connectivity/${hostname/ip}_success_rate    |           | success rate (successful/total) of TCP connections between the current node and other nodes |
+| tcp-connectivity/${hostname/ip}_time_min        | time (ms) | minimum latency of TCP connections between the current node and other nodes                 |
+| tcp-connectivity/${hostname/ip}_time_max        | time (ms) | maximum latency of TCP connections between the current node and other nodes                 |
+| tcp-connectivity/${hostname/ip}_time_avg        | time (ms) | average latency of TCP connections between the current node and other nodes                 |
### `gpcnet-network-test` / `gpcnet-network-load-test`
@@ -177,22 +177,23 @@ gpcnet-network-test: Full system network tests in random and natural ring, allto
gpcnet-network-load-test: Select full system network tests run with four congestors to measure network congestion or contention, at least 10 nodes
- - test title: Isolated Network Tests, Isolated Congestion Tests, Network Tests running with Congestion Tests ( RR Two-sided Lat Network Test), Network Tests running with Congestion Tests (RR Two-sided BW+Sync Network Test), Network Tests running with Congestion Tests ( Multiple Allreduce Network Test), Network Tests running with Congestion Tests - Key Results
- - supporting network tests: RR Two-sided Lat (8 B), RR Two-sided BW+Sync (131072 B), Multiple Allreduce (8 B)
- - supporting congetors: Alltoall (4096 B), Two-sided Incast (4096 B), Put Incast (4096 B), Get Bcast (4096 B)
+ - supporting network tests: RR Two-sided Lat (8 B), RR Get Lat (8 B), RR Two-sided BW (131072 B), RR Put BW (131072 B), RR Two-sided BW+Sync (131072 B), Nat Two-sided BW (131072 B), Multiple Allreduce (8 B), Multiple Alltoall (4096 B)
+ - supporting congestors: Alltoall (4096 B), Two-sided Incast (4096 B), Put Incast (4096 B), Get Bcast (4096 B)
#### Metrics
-| Metrics | Unit | Description |
-| ------------------------------------------------------------------- | --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| {benchmark_name}/${test_title}_RRTwo-sidedLat(8B)_${stat} | time(usec) | statistical values(Min, Max, Avg, 99%, 99.9%) obtained by all nodes use algorithm 'random ring communication pattern two-side latency' for network testing |
-| {benchmark_name}/${test_title}_RRTwo-sidedBW+Sync(131072B)_${stat} | MiB/s/rank | fstatistical values(Min, Max, Avg, 99%, 99.9%) obtained by all nodes use algorithm 'random ring communication pattern two-side bandwidth with barrier' for network testing |
-| {benchmark_name}/${test_title}_MultipleAllreduce(8B)_${stat} | time(usec) | statistical values(Min, Max, Avg, 99%, 99.9%) obtained by all nodes use algorithm 'multiple allreduce bandwidth' for network testing |
-| {benchmark_name}/${test_title}_GetBcast(4096B)_${stat} | bandwidth (MB/s/rank) | statistical values(Min, Max, Avg, 99%, 99.9%) obtained by all nodes use congestion 'Get Bcast(4096B)' for congestion testing |
-| {benchmark_name}/${test_title}_PutIncast(4096B)_${stat} | bandwidth (MB/s/rank) | statistical values(Min, Max, Avg, 99%, 99.9%) obtained by all nodes use congestion 'Put Incast (4096 B)' for congestion testing |
-| {benchmark_name}/${test_title}_Two-sidedIncast(4096B)_${stat} | bandwidth (MB/s/rank) | statistical values(Min, Max, Avg, 99%, 99.9%) obtained by all nodes use congestion 'Two-sided Incast (4096 B)' for congestion testing |
-| {benchmark_name}/${test_title}_Alltoall(4096B)_${stat} | bandwidth (MB/s/rank) | statistical values(Min, Max, Avg, 99%, 99.9%) obtained by all nodes use congestion 'Alltoall (4096 B)' for congestion testing |
-| gpcnet-network-load-test/${test_title}_${network_test_algo}_${stat} | times(x) | summary about congestion impact factor of every network test algorithm |
+| Metrics | Unit | Description |
+|---------------------------------------------------------|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| gpcnet-network-test/rr_two-sided_lat_${stat}             | time (us)              | statistical values (min, max, avg, 99%, 99.9%) across all nodes for the 'random ring communication pattern two-sided latency' network test                |
+| gpcnet-network-test/rr_two-sided+sync_bw_${stat}         | bandwidth (MiB/s/rank) | statistical values (min, max, avg, 99%, 99.9%) across all nodes for the 'random ring communication pattern two-sided bandwidth with barrier' network test |
+| gpcnet-network-test/multiple_allreduce_time_${stat}      | time (us)              | statistical values (min, max, avg, 99%, 99.9%) across all nodes for the 'multiple allreduce bandwidth' network test                                       |
+| gpcnet-network-test/rr_get_lat_${stat}                   | time (us)              | statistical values (min, max, avg, 99%, 99.9%) across all nodes for the 'RR Get Lat (8 B)' network test                                                   |
+| gpcnet-network-test/rr_two-sided_bw_${stat}              | bandwidth (MiB/s/rank) | statistical values (min, max, avg, 99%, 99.9%) across all nodes for the 'RR Two-sided BW (131072 B)' network test                                         |
+| gpcnet-network-test/nat_two-sided_bw_${stat}             | bandwidth (MiB/s/rank) | statistical values (min, max, avg, 99%, 99.9%) across all nodes for the 'Nat Two-sided BW (131072 B)' network test                                        |
+| gpcnet-network-test/multiple_alltoall_bw_${stat}         | bandwidth (MiB/s/rank) | statistical values (min, max, avg, 99%, 99.9%) across all nodes for the 'Multiple Alltoall (4096 B)' network test                                         |
+| gpcnet-network-load-test/rr_two-sided_lat_x_${stat}      | factor (x)             | congestion impact factor of the 'RR Two-sided Lat (8 B)' network test when run with congestors                                                            |
+| gpcnet-network-load-test/rr_two-sided+sync_bw_x_${stat}  | factor (x)             | congestion impact factor of the 'RR Two-sided BW+Sync (131072 B)' network test when run with congestors                                                   |
+| gpcnet-network-load-test/multiple_allreduce_x_${stat}    | factor (x)             | congestion impact factor of the 'Multiple Allreduce (8 B)' network test when run with congestors                                                          |
### `ib-traffic`
@@ -204,11 +205,11 @@ The traffic pattern is defined in a config file, which is pre-defined for one-to
Each row in the config is one round, and all pairs of nodes in a row run ib command simultaneously.
#### Metrics
-
-| Metrics | Unit | Description |
-| -------------------------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
-| ib-traffic/${command}-${line}-${pair} | bandwidth (MB/s) | The average bandwidth of ib command (ib_write_bw, ib_send_bw, ib_read_bw) run between the ${pair}th node pair in the ${line}th line of the config |
-| ib-traffic/${command}-${line}-${pair} | time (us) | The max latency of ib command (ib_write_lat, ib_send_lat, ib_read_lat) run between the ${pair}th node pair in the ${line}th line of the config |
+
+| Metrics | Unit | Description |
+|---------------------------------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| ib-traffic/${command}_${line}_${pair}_${server}_${client}_bw  | bandwidth (GB/s) | The max bandwidth of the ib command (ib_write_bw, ib_send_bw, ib_read_bw) run between the ${pair}th node pair in the ${line}th line of the config; ${server} and ${client} are the hostnames of the server and the client  |
+| ib-traffic/${command}_${line}_${pair}_${server}_${client}_lat | time (us)        | The max latency of the ib command (ib_write_lat, ib_send_lat, ib_read_lat) run between the ${pair}th node pair in the ${line}th line of the config; ${server} and ${client} are the hostnames of the server and the client |
## Computation-communication Benchmarks
@@ -223,8 +224,8 @@ Test the performance of single node when communication and computation overlap.
| Name | Unit | Description |
|-------------------------------------------------------|-----------|--------------------------------------------------------------|
-| pytorch-computation-communication-overlap/mul_cost | time (ms) | Time of communication and mul kernel computation overlap. |
-| pytorch-computation-communication-overlap/matmul_cost | time (ms) | Time of communication and matmul kernel computation overlap. |
+| pytorch-computation-communication-overlap/mul_time | time (ms) | Time of communication and mul kernel computation overlap. |
+| pytorch-computation-communication-overlap/matmul_time | time (ms) | Time of communication and matmul kernel computation overlap. |
####
@@ -238,10 +239,10 @@ Test the performance of large scale matmul operation with multiple GPUs:
#### Metrics
-| Name | Unit | Description |
-|-----------------------------------|-----------|------------------------------------------|
-| pytorch-sharding-matmul/allreduce | time (ms) | Time of sharding matmul using allreduce. |
-| pytorch-sharding-matmul/allgather | time (ms) | Time of sharding matmul using allgather. |
+| Name | Unit | Description |
+|----------------------------------------|-----------|------------------------------------------|
+| pytorch-sharding-matmul/allreduce_time | time (ms) | Time of sharding matmul using allreduce. |
+| pytorch-sharding-matmul/allgather_time | time (ms) | Time of sharding matmul using allgather. |
## Storage Benchmarks
@@ -253,14 +254,14 @@ Measure the disk performance through [FIO](https://github.com/axboe/fio/tree/031
#### Metrics
-| Name | Unit | Description |
-|--------------------------------------------------------------------|--------------|----------------------------------------------------------|
-| disk-benchmark/${disk_name}_rand_read_write_bs | size (bytes) | Disk random read write block size. |
-| disk-benchmark/${disk_name}_rand_read_write_read_iops | IOPS | Disk random read write read IOPS. |
-| disk-benchmark/${disk_name}_rand_read_write_read_lat_ns_95.000000 | time (ns) | Disk random read write read latency in 95.0 percentile. |
-| disk-benchmark/${disk_name}_rand_read_write_read_lat_ns_99.000000 | time (ns) | Disk random read write read latency in 99.0 percentile. |
-| disk-benchmark/${disk_name}_rand_read_write_read_lat_ns_99.900000 | time (ns) | Disk random read write read latency in 99.9 percentile. |
-| disk-benchmark/${disk_name}_rand_read_write_write_iops | IOPS | Disk random read write write IOPS. |
-| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_95.000000 | time (ns) | Disk random read write write latency in 95.0 percentile. |
-| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.000000 | time (ns) | Disk random read write write latency in 99.0 percentile. |
-| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.900000 | time (ns) | Disk random read write write latency in 99.9 percentile. |
+| Name | Unit | Description |
+|---------------------------------------------------------------|--------------|----------------------------------------------------------|
+| disk-benchmark/${disk_name}_rand_read_write_bs                 | size (bytes) | Block size of the disk random read-write workload.                    |
+| disk-benchmark/${disk_name}_rand_read_write_read_iops          | IOPS         | Read IOPS of the disk random read-write workload.                     |
+| disk-benchmark/${disk_name}_rand_read_write_read_lat_ns_95.0   | time (ns)    | 95.0 percentile read latency of the disk random read-write workload.  |
+| disk-benchmark/${disk_name}_rand_read_write_read_lat_ns_99.0   | time (ns)    | 99.0 percentile read latency of the disk random read-write workload.  |
+| disk-benchmark/${disk_name}_rand_read_write_read_lat_ns_99.9   | time (ns)    | 99.9 percentile read latency of the disk random read-write workload.  |
+| disk-benchmark/${disk_name}_rand_read_write_write_iops         | IOPS         | Write IOPS of the disk random read-write workload.                    |
+| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_95.0  | time (ns)    | 95.0 percentile write latency of the disk random read-write workload. |
+| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.0  | time (ns)    | 99.0 percentile write latency of the disk random read-write workload. |
+| disk-benchmark/${disk_name}_rand_read_write_write_lat_ns_99.9  | time (ns)    | 99.9 percentile write latency of the disk random read-write workload. |
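All of the renames in this file follow the same convention: metric names are lowercase snake_case and end with a suffix naming the value type (`_bw`, `_time`, `_flops`, `_iops`, `_count`, `_rate`, ...). A small self-check over a few of the names above; the suffix list and helper are illustrative, not part of the change.

```python
# Illustrative check of the naming convention used by the renamed metrics.
TYPE_SUFFIXES = ('_bw', '_time', '_flops', '_iops', '_count', '_rate', '_bs')

def follows_convention(metric):
    """True if the metric body is lowercase and ends with a type suffix."""
    name = metric.split('/', 1)[-1]
    return name == name.lower() and name.endswith(TYPE_SUFFIXES)

for m in ('kernel-launch/event_time', 'gemm-flops/fp16_tc_flops',
          'mem-bw/h2d_bw', 'pytorch-sharding-matmul/allreduce_time',
          'disk-benchmark/nvme0n1_rand_read_write_read_iops'):
    assert follows_convention(m), m
```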
diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index 0e37e6878..7c8d35930 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -15,16 +15,16 @@ including gpt2-small, gpt2-medium, gpt2-large and gpt2-xl.
#### Metrics
-| Name | Unit | Description |
-|---------------------------------------------------------------|------------------------|---------------------------------------------|
-| gpt_models/pytorch-${model_name}/steptime_train_float32 | time (ms) | Train step time with single precision. |
-| gpt_models/pytorch-${model_name}/throughput_train_float32 | throughput (samples/s) | Train throughput with single precision. |
-| gpt_models/pytorch-${model_name}/steptime_inference_float32 | time (ms) | Inference step time with single precision. |
-| gpt_models/pytorch-${model_name}/throughput_inference_float32 | throughput (samples/s) | Inference throughput with single precision. |
-| gpt_models/pytorch-${model_name}/steptime_train_float16 | time (ms) | Train step time with half precision. |
-| gpt_models/pytorch-${model_name}/throughput_train_float16 | throughput (samples/s) | Train throughput with half precision. |
-| gpt_models/pytorch-${model_name}/steptime_inference_float16 | time (ms) | Inference step time with half precision. |
-| gpt_models/pytorch-${model_name}/throughput_inference_float16 | throughput (samples/s) | Inference throughput with half precision. |
+| Name | Unit | Description |
+|------------------------------------------------------------|------------------------|---------------------------------------------|
+| gpt_models/pytorch-${model_name}/fp32_train_step_time | time (ms) | Train step time with single precision. |
+| gpt_models/pytorch-${model_name}/fp32_train_throughput | throughput (samples/s) | Train throughput with single precision. |
+| gpt_models/pytorch-${model_name}/fp32_inference_step_time | time (ms) | Inference step time with single precision. |
+| gpt_models/pytorch-${model_name}/fp32_inference_throughput | throughput (samples/s) | Inference throughput with single precision. |
+| gpt_models/pytorch-${model_name}/fp16_train_step_time | time (ms) | Train step time with half precision. |
+| gpt_models/pytorch-${model_name}/fp16_train_throughput | throughput (samples/s) | Train throughput with half precision. |
+| gpt_models/pytorch-${model_name}/fp16_inference_step_time | time (ms) | Inference step time with half precision. |
+| gpt_models/pytorch-${model_name}/fp16_inference_throughput | throughput (samples/s) | Inference throughput with half precision. |
### `bert_models`
@@ -34,16 +34,16 @@ Run training or inference tasks with single or half precision for BERT models, i
#### Metrics
-| Name | Unit | Description |
-|----------------------------------------------------------------|------------------------|---------------------------------------------|
-| bert_models/pytorch-${model_name}/steptime_train_float32 | time (ms) | Train step time with single precision. |
-| bert_models/pytorch-${model_name}/throughput_train_float32 | throughput (samples/s) | Train throughput with single precision. |
-| bert_models/pytorch-${model_name}/steptime_inference_float32 | time (ms) | Inference step time with single precision. |
-| bert_models/pytorch-${model_name}/throughput_inference_float32 | throughput (samples/s) | Inference throughput with single precision. |
-| bert_models/pytorch-${model_name}/steptime_train_float16 | time (ms) | Train step time with half precision. |
-| bert_models/pytorch-${model_name}/throughput_train_float16 | throughput (samples/s) | Train throughput with half precision. |
-| bert_models/pytorch-${model_name}/steptime_inference_float16 | time (ms) | Inference step time with half precision. |
-| bert_models/pytorch-${model_name}/throughput_inference_float16 | throughput (samples/s) | Inference throughput with half precision. |
+| Name | Unit | Description |
+|-------------------------------------------------------------|------------------------|---------------------------------------------|
+| bert_models/pytorch-${model_name}/fp32_train_step_time | time (ms) | Train step time with single precision. |
+| bert_models/pytorch-${model_name}/fp32_train_throughput | throughput (samples/s) | Train throughput with single precision. |
+| bert_models/pytorch-${model_name}/fp32_inference_step_time | time (ms) | Inference step time with single precision. |
+| bert_models/pytorch-${model_name}/fp32_inference_throughput | throughput (samples/s) | Inference throughput with single precision. |
+| bert_models/pytorch-${model_name}/fp16_train_step_time | time (ms) | Train step time with half precision. |
+| bert_models/pytorch-${model_name}/fp16_train_throughput | throughput (samples/s) | Train throughput with half precision. |
+| bert_models/pytorch-${model_name}/fp16_inference_step_time | time (ms) | Inference step time with half precision. |
+| bert_models/pytorch-${model_name}/fp16_inference_throughput | throughput (samples/s) | Inference throughput with half precision. |
### `lstm_models`
@@ -53,16 +53,16 @@ Run training or inference tasks with single or half precision for one bidirectio
#### Metrics
-| Name | Unit | Description |
-|-------------------------------------------------------|------------------------|---------------------------------------------|
-| lstm_models/pytorch-lstm/steptime_train_float32 | time (ms) | Train step time with single precision. |
-| lstm_models/pytorch-lstm/throughput_train_float32 | throughput (samples/s) | Train throughput with single precision. |
-| lstm_models/pytorch-lstm/steptime_inference_float32 | time (ms) | Inference step time with single precision. |
-| lstm_models/pytorch-lstm/throughput_inference_float32 | throughput (samples/s) | Inference throughput with single precision. |
-| lstm_models/pytorch-lstm/steptime_train_float16 | time (ms) | Train step time with half precision. |
-| lstm_models/pytorch-lstm/throughput_train_float16 | throughput (samples/s) | Train throughput with half precision. |
-| lstm_models/pytorch-lstm/steptime_inference_float16 | time (ms) | Inference step time with half precision. |
-| lstm_models/pytorch-lstm/throughput_inference_float16 | throughput (samples/s) | Inference throughput with half precision. |
+| Name | Unit | Description |
+|----------------------------------------------------|------------------------|---------------------------------------------|
+| lstm_models/pytorch-lstm/fp32_train_step_time | time (ms) | Train step time with single precision. |
+| lstm_models/pytorch-lstm/fp32_train_throughput | throughput (samples/s) | Train throughput with single precision. |
+| lstm_models/pytorch-lstm/fp32_inference_step_time | time (ms) | Inference step time with single precision. |
+| lstm_models/pytorch-lstm/fp32_inference_throughput | throughput (samples/s) | Inference throughput with single precision. |
+| lstm_models/pytorch-lstm/fp16_train_step_time | time (ms) | Train step time with half precision. |
+| lstm_models/pytorch-lstm/fp16_train_throughput | throughput (samples/s) | Train throughput with half precision. |
+| lstm_models/pytorch-lstm/fp16_inference_step_time | time (ms) | Inference step time with half precision. |
+| lstm_models/pytorch-lstm/fp16_inference_throughput | throughput (samples/s) | Inference throughput with half precision. |
### `cnn_models`
@@ -83,13 +83,13 @@ Run training or inference tasks with single or half precision for CNN models lis
#### Metrics
-| Name | Unit | Description |
-|---------------------------------------------------------------|------------------------|---------------------------------------------|
-| cnn_models/pytorch-${model_name}/steptime_train_float32 | time (ms) | Train step time with single precision. |
-| cnn_models/pytorch-${model_name}/throughput_train_float32 | throughput (samples/s) | Train throughput with single precision. |
-| cnn_models/pytorch-${model_name}/steptime_inference_float32 | time (ms) | Inference step time with single precision. |
-| cnn_models/pytorch-${model_name}/throughput_inference_float32 | throughput (samples/s) | Inference throughput with single precision. |
-| cnn_models/pytorch-${model_name}/steptime_train_float16 | time (ms) | Train step time with half precision. |
-| cnn_models/pytorch-${model_name}/throughput_train_float16 | throughput (samples/s) | Train throughput with half precision. |
-| cnn_models/pytorch-${model_name}/steptime_inference_float16 | time (ms) | Inference step time with half precision. |
-| cnn_models/pytorch-${model_name}/throughput_inference_float16 | throughput (samples/s) | Inference throughput with half precision. |
+| Name | Unit | Description |
+|------------------------------------------------------------|------------------------|---------------------------------------------|
+| cnn_models/pytorch-${model_name}/fp32_train_step_time | time (ms) | Train step time with single precision. |
+| cnn_models/pytorch-${model_name}/fp32_train_throughput | throughput (samples/s) | Train throughput with single precision. |
+| cnn_models/pytorch-${model_name}/fp32_inference_step_time | time (ms) | Inference step time with single precision. |
+| cnn_models/pytorch-${model_name}/fp32_inference_throughput | throughput (samples/s) | Inference throughput with single precision. |
+| cnn_models/pytorch-${model_name}/fp16_train_step_time | time (ms) | Train step time with half precision. |
+| cnn_models/pytorch-${model_name}/fp16_train_throughput | throughput (samples/s) | Train throughput with half precision. |
+| cnn_models/pytorch-${model_name}/fp16_inference_step_time | time (ms) | Inference step time with half precision. |
+| cnn_models/pytorch-${model_name}/fp16_inference_throughput | throughput (samples/s) | Inference throughput with half precision. |
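The four model tables above share a single naming pattern, `{precision}_{action}_{measure}`. A short sketch of the expansion for one benchmark; the loop is illustrative, and `${model_name}` is left as a placeholder.

```python
# Illustrative expansion of the renamed model-benchmark metric names.
for precision in ('fp32', 'fp16'):
    for action in ('train', 'inference'):
        for measure in ('step_time', 'throughput'):
            print('bert_models/pytorch-${model_name}/%s_%s_%s'
                  % (precision, action, measure))
# Prints the eight names in the bert_models table above.
```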
diff --git a/superbench/benchmarks/docker_benchmarks/rocm_onnxruntime_performance.py b/superbench/benchmarks/docker_benchmarks/rocm_onnxruntime_performance.py
index 025eff813..672899b16 100644
--- a/superbench/benchmarks/docker_benchmarks/rocm_onnxruntime_performance.py
+++ b/superbench/benchmarks/docker_benchmarks/rocm_onnxruntime_performance.py
@@ -73,7 +73,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
model_name = model_name.replace(char, '_')
elif value_prefix in line and model_name is not None:
throughput = float(line[len(value_prefix):])
- self._result.add_result(model_name, throughput)
+ self._result.add_result(model_name + '_throughput', throughput)
model_name = None
except BaseException as e:
logger.error(
diff --git a/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py b/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py
index 764ff3173..5bad18982 100644
--- a/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py
+++ b/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py
@@ -237,7 +237,7 @@ def _benchmark(self):
compute_end = time.perf_counter()
torch.cuda.synchronize()
- compute_metric = '{}_cost'.format(kernel)
+ compute_metric = '{}_time'.format(kernel)
compute_elapse_times = [(compute_end - start) * 1000 / self._args.num_steps]
if not self._process_numeric_result(compute_metric, compute_elapse_times):
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
index 6b9785064..4ab0eb2fc 100644
--- a/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cuda_gemm_flops_performance.py
@@ -27,21 +27,21 @@ def __init__(self, name, parameters=''):
# TODO - To support more architecutres, currently only support compute capability = 7.0 and 8.0
self.__kernel_map = {
7.0: {
- 'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
- 'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
- 'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
- 'FP16_TC': 'cutlass_tensorop_h884gemm_256x128_32x2_*',
+ 'fp64': 'cutlass_simt_dgemm_128x128_8x2_*',
+ 'fp32': 'cutlass_simt_sgemm_128x128_8x2_*',
+ 'fp16': 'cutlass_simt_hgemm_256x128_8x2_*',
+ 'fp16_tc': 'cutlass_tensorop_h884gemm_256x128_32x2_*',
},
8.0: {
- 'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
- 'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
- 'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
- 'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
- 'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
- 'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
- 'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
- 'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
- 'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
+ 'fp64': 'cutlass_simt_dgemm_128x128_8x2_*',
+ 'fp32': 'cutlass_simt_sgemm_128x128_8x2_*',
+ 'fp16': 'cutlass_simt_hgemm_256x128_8x2_*',
+ 'fp64_tc': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
+ 'tf32_tc': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
+ 'bf16_tc': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
+ 'fp16_tc': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
+ 'int8_tc': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
+ 'int4_tc': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
}
}
self.__parse_logline = [
@@ -128,7 +128,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
)
return False
- self._result.add_result(precision, max(flops))
+ self._result.add_result(self._metric_map[precision], max(flops))
return True
diff --git a/superbench/benchmarks/micro_benchmarks/disk_performance.py b/superbench/benchmarks/micro_benchmarks/disk_performance.py
index 095b054e6..1af56fcef 100644
--- a/superbench/benchmarks/micro_benchmarks/disk_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/disk_performance.py
@@ -209,7 +209,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
lat_unit_prefix = '%s_%s' % (io_type_prefix, lat_unit)
for lat_percentile in ['95.000000', '99.000000', '99.900000']:
lat = fio_output['jobs'][0][io_type][lat_unit]['percentile'][lat_percentile]
- self._result.add_result('%s_%s' % (lat_unit_prefix, lat_percentile), float(lat))
+ self._result.add_result('%s_%s' % (lat_unit_prefix, lat_percentile[:-5]), float(lat))
break
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
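The `[:-5]` slice above is what shortens FIO's fixed-precision percentile keys into the suffixes documented earlier; a standalone illustration:

```python
# FIO emits percentile keys with six decimal places; dropping the last five
# characters keeps one decimal place, matching the documented metric names.
for key in ('95.000000', '99.000000', '99.900000'):
    print(key, '->', key[:-5])
# 95.000000 -> 95.0
# 99.000000 -> 99.0
# 99.900000 -> 99.9
```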
diff --git a/superbench/benchmarks/micro_benchmarks/gemm_flops_performance_base.py b/superbench/benchmarks/micro_benchmarks/gemm_flops_performance_base.py
index 09cdb5bc6..a1b283b1e 100644
--- a/superbench/benchmarks/micro_benchmarks/gemm_flops_performance_base.py
+++ b/superbench/benchmarks/micro_benchmarks/gemm_flops_performance_base.py
@@ -20,9 +20,24 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters)
self._support_precisions = [
- 'FP64', 'FP32', 'FP16', 'FP64_TC', 'TF32_TC', 'BF16_TC', 'FP16_TC', 'INT8_TC', 'INT4_TC'
+ 'fp64', 'fp32', 'fp16', 'fp64_tc', 'tf32_tc', 'bf16_tc', 'fp16_tc', 'int8_tc', 'int4_tc'
]
self._precision_need_to_run = list()
+ self._metric_map = {
+ 'fp64': 'fp64_flops',
+ 'fp32': 'fp32_flops',
+ 'fp16': 'fp16_flops',
+ 'fp64_tc': 'fp64_tc_flops',
+ 'tf32_tc': 'tf32_tc_flops',
+ 'bf16_tc': 'bf16_tc_flops',
+ 'fp16_tc': 'fp16_tc_flops',
+ 'int8_tc': 'int8_tc_iops',
+ 'int4_tc': 'int4_tc_iops',
+ 'fp32_xdlops': 'fp32_xdlops_flops',
+ 'fp16_xdlops': 'fp16_xdlops_flops',
+ 'bf16_xdlops': 'bf16_xdlops_flops',
+ 'int8_xdlops': 'int8_xdlops_iops'
+ }
def add_parser_arguments(self):
"""Add the specified arguments."""
@@ -76,7 +91,7 @@ def _preprocess(self):
if len(self._args.precision) == 0:
self._precision_need_to_run = self._support_precisions
else:
- self._args.precision = [p.upper() for p in self._args.precision]
+ self._args.precision = [p.lower() for p in self._args.precision]
for p in self._args.precision:
if p not in self._support_precisions:
logger.warning(
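With `_metric_map` in place, a precision (now normalized to lowercase) is translated into the reported metric name, which carries a `_flops` or `_iops` suffix. A minimal sketch of the lookup, using an excerpt of the map from this change:

```python
# Excerpt of _metric_map from the change above; lookup is by lowercase precision.
metric_map = {
    'fp16_tc': 'fp16_tc_flops',
    'int8_tc': 'int8_tc_iops',
}
precision = 'FP16_TC'.lower()   # user-supplied precisions are lowercased first
print(metric_map[precision])    # -> fp16_tc_flops
```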
diff --git a/superbench/benchmarks/micro_benchmarks/gpcnet_performance.py b/superbench/benchmarks/micro_benchmarks/gpcnet_performance.py
index 17ef1aa19..7a194173b 100644
--- a/superbench/benchmarks/micro_benchmarks/gpcnet_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/gpcnet_performance.py
@@ -24,6 +24,25 @@ def __init__(self, name, parameters=''):
self._bin_name = 'network_test'
if self._name == 'gpcnet-network-load-test':
self._bin_name = 'network_load_test'
+ self.__metrics = {
+ 'RRTwo-sidedLat(8B)': 'rr_two-sided_lat',
+ 'RRGetLat(8B)': 'rr_get_lat',
+ 'RRTwo-sidedBW(131072B)': 'rr_two-sided_bw',
+ 'RRPutBW(131072B)': 'rr_put_bw',
+ 'RRTwo-sidedBW+Sync(131072B)': 'rr_two-sided+sync_bw',
+ 'NatTwo-sidedBW(131072B)': 'nat_two-sided_bw',
+ 'MultipleAllreduce(8B)': 'multiple_allreduce_time',
+ 'MultipleAlltoall(4096B)': 'multiple_alltoall_bw',
+ 'GetBcast(4096B)': 'get_bcast_bw',
+ 'PutIncast(4096B)': 'put_incast_bw',
+ 'Two-sidedIncast(4096B)': 'two-sided_incast_bw',
+ 'Alltoall(4096B)': 'alltoall_bw'
+ }
+ self.__metrics_x = {
+ 'RRTwo-sidedLat(8B)': 'rr_two-sided_lat_x',
+ 'RRTwo-sidedBW+Sync(131072B)': 'rr_two-sided+sync_bw_x',
+ 'MultipleAllreduce(8B)': 'multiple_allreduce_x',
+ }
def add_parser_arguments(self):
"""Add the specified arguments."""
@@ -70,18 +89,29 @@ def _process_raw_result(self, idx, raw_output): # noqa: C901
items = [item.strip() for item in items]
# Get table name
if len(items) == 3 and 'Tests' in items[1]:
- test_name = items[1].replace(' ', '')
+ test_name = items[1].replace(' ', '_').lower()
# Get the line of the table labels
elif 'Avg' in line or 'Name' in line:
labels = items
# Get values related to the labels
else:
- name_prefix = items[1].replace(' ', '')
- for i in range(2, len(items) - 1):
- if labels[i] != 'Units':
- self._result.add_result(
- test_name + '_' + name_prefix + '_' + labels[i], float(items[i].strip('X'))
- )
+ if self._name == 'gpcnet-network-test':
+ name_prefix = items[1].replace(' ', '')
+ for i in range(2, len(items) - 1):
+ if labels[i] != 'Units':
+ self._result.add_result(
+ self.__metrics[name_prefix] + '_' + labels[i].lower(),
+ float(items[i].strip('X'))
+ )
+ elif test_name == 'network_tests_running_with_congestion_tests_-_key_results' \
+ and self._name == 'gpcnet-network-load-test':
+ name_prefix = items[1].replace(' ', '')
+ for i in range(2, len(items) - 1):
+ if labels[i] != 'Units':
+ self._result.add_result(
+ self.__metrics_x[name_prefix] + '_' + labels[i].lower(),
+ float(items[i].strip('X'))
+ )
elif 'ERROR: this application must be run on at least' in raw_output:
return True
else:
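The parser above strips spaces from the raw GPCNet table label, maps it through `__metrics` (or `__metrics_x` for the load test), and appends the lowercased column label. A standalone sketch of that pipeline on one sample row; the row contents are illustrative.

```python
# Illustrative reproduction of the gpcnet metric-name construction.
metrics = {'RRTwo-sidedLat(8B)': 'rr_two-sided_lat'}
labels = ['|', 'Name', 'Avg', 'Units']                   # parsed header row
items = ['|', 'RR Two-sided Lat (8B)', '1.7X', 'usec']   # parsed value row
name_prefix = items[1].replace(' ', '')                  # 'RRTwo-sidedLat(8B)'
metric = metrics[name_prefix] + '_' + labels[2].lower()  # 'rr_two-sided_lat_avg'
print(metric, float(items[2].strip('X')))                # rr_two-sided_lat_avg 1.7
```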
diff --git a/superbench/benchmarks/micro_benchmarks/gpu_copy_bw_performance.py b/superbench/benchmarks/micro_benchmarks/gpu_copy_bw_performance.py
index 2dd74851f..25870b187 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_copy_bw_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_bw_performance.py
@@ -100,7 +100,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
output_lines = [x.strip() for x in raw_output.strip().splitlines()]
for output_line in output_lines:
tag, bw_str = output_line.split()
- self._result.add_result(tag, float(bw_str))
+ self._result.add_result(tag + '_bw', float(bw_str))
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
diff --git a/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
index 076f34b18..d60e39203 100644
--- a/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
@@ -201,8 +201,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
continue
# Extract value from the line
size = int(values[0])
- avg_bw = float(values[-2])
- metric = 'IB_{}_{}_Avg_{}'.format(self._args.commands[cmd_idx], size, str(self._args.ib_index))
+ avg_bw = float(values[-2]) / 1000
+ metric = 'ib_{}_{}_ib{}_bw'.format(self._args.commands[cmd_idx], size, str(self._args.ib_index))
# Filter useless value in client output
if metric not in metric_set:
metric_set.add(metric)
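Two things changed in the loopback parser: the raw value is divided by 1000 (perftest reports MB/s) so the metric is in GB/s, matching the updated docs, and the device index is embedded in the name. A sketch with illustrative values:

```python
# Illustrative construction of the new ib-loopback metric name and value.
command, size, ib_index = 'write', 8388608, 0   # example command and message size
mbps = 24056.74                                  # example perftest average, MB/s
avg_bw = mbps / 1000                             # reported in GB/s now
metric = 'ib_{}_{}_ib{}_bw'.format(command, size, ib_index)
print(metric, avg_bw)   # ib_write_8388608_ib0_bw 24.05674
```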
diff --git a/superbench/benchmarks/micro_benchmarks/ib_validation_performance.py b/superbench/benchmarks/micro_benchmarks/ib_validation_performance.py
index 28fb574ba..e2a17f343 100644
--- a/superbench/benchmarks/micro_benchmarks/ib_validation_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ib_validation_performance.py
@@ -207,19 +207,22 @@ def __prepare_config(self, node_num):
# Use the config file defined in args
else:
self.__config_path = self._args.config
+ # Read the hostfile
+ with open(self._args.hostfile, 'r') as f:
+ hosts = f.readlines()
# Read the config file and check if it's empty and valid
with open(self.__config_path, 'r') as f:
lines = f.readlines()
- for line in lines:
- pairs = line.strip().strip(';').split(';')
- # Check format of config
- for pair in pairs:
- pair = pair.split(',')
- if len(pair) != 2:
- return False
- pair[0] = int(pair[0])
- pair[1] = int(pair[1])
- self.__config.extend(pairs)
+ for line in lines:
+ pairs = line.strip().strip(';').split(';')
+ # Check format of config
+ for pair in pairs:
+ pair = pair.split(',')
+ if len(pair) != 2:
+ return False
+ pair[0] = int(pair[0])
+ pair[1] = int(pair[1])
+ self.__config.append('{}_{}'.format(hosts[pair[0]].strip(), hosts[pair[1]].strip()))
except BaseException as e:
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error('Failed to generate and check config - benchmark: {}, message: {}.'.format(self._name, str(e)))
@@ -345,6 +348,8 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
content = raw_output.splitlines()
line_index = 0
config_index = 0
+ command = self._args.commands[cmd_idx]
+ suffix = command.split('_')[-1]
try:
result_index = -1
for index, line in enumerate(content):
@@ -357,11 +362,22 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
content = content[result_index:]
for line in content:
line = list(filter(None, line.strip().split(',')))
+ pair_index = 0
for item in line:
- metric = '{line}-{pair}'.format(line=str(line_index), pair=self.__config[config_index])
- self._result.add_result(metric, float(item))
+ metric = '{command}_{line}_{pair}_{host}_{suffix}'.format(
+ command=command,
+ line=str(line_index),
+ pair=pair_index,
+ host=self.__config[config_index],
+ suffix=suffix
+ )
+ value = float(item)
+ if 'bw' in command:
+ value = value / 1000
+ self._result.add_result(metric, value)
valid = True
config_index += 1
+ pair_index += 1
line_index += 1
except Exception:
valid = False
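Each ib-traffic value is now tagged with the command, the config line, the pair index within that line, and the server/client hostnames resolved from the hostfile, plus a suffix (`bw` or `lat`) taken from the command name; bandwidths are also converted from MB/s to GB/s. A sketch of one expanded name, with illustrative hostnames and indices:

```python
# Illustrative expansion of the new ib-traffic metric name.
command = 'ib_write_bw'
line_index, pair_index = 0, 1
pair_hosts = 'node0_node1'        # '{}_{}'.format(server, client) from the hostfile
suffix = command.split('_')[-1]   # 'bw'; bandwidth values are divided by 1000
metric = '{}_{}_{}_{}_{}'.format(command, line_index, pair_index, pair_hosts, suffix)
print(metric)   # ib_write_bw_0_1_node0_node1_bw
```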
diff --git a/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead.py b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead.py
index dbe056d84..60a89ba6b 100644
--- a/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead.py
+++ b/superbench/benchmarks/micro_benchmarks/kernel_launch_overhead.py
@@ -100,8 +100,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
)
return False
- self._result.add_result('event_overhead', result[0])
- self._result.add_result('wall_overhead', result[1])
+ self._result.add_result('event_time', result[0])
+ self._result.add_result('wall_time', result[1])
return True
diff --git a/superbench/benchmarks/micro_benchmarks/memory_bw_performance_base.py b/superbench/benchmarks/micro_benchmarks/memory_bw_performance_base.py
index 13115d5dc..5cc732cf0 100644
--- a/superbench/benchmarks/micro_benchmarks/memory_bw_performance_base.py
+++ b/superbench/benchmarks/micro_benchmarks/memory_bw_performance_base.py
@@ -20,7 +20,7 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters)
self._mem_types = ['htod', 'dtoh', 'dtod']
- self._metrics = ['H2D_Mem_BW', 'D2H_Mem_BW', 'D2D_Mem_BW']
+ self._metrics = ['h2d_bw', 'd2h_bw', 'd2d_bw']
self._memory = ['pinned', 'unpinned']
self._parse_logline_map = {'htod': 'H2D', 'dtoh': 'D2H', 'dtod': 'D2D'}
diff --git a/superbench/benchmarks/micro_benchmarks/rocm_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/rocm_gemm_flops_performance.py
index 9315bd922..76c86fc45 100644
--- a/superbench/benchmarks/micro_benchmarks/rocm_gemm_flops_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/rocm_gemm_flops_performance.py
@@ -22,13 +22,13 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters)
self._bin_name = 'rocblas-bench'
- self._support_precisions = ['FP64', 'FP32_xDLOPS', 'FP16_xDLOPS', 'BF16_xDLOPS', 'INT8_xDLOPS']
+ self._support_precisions = ['fp64', 'fp32_xdlops', 'fp16_xdlops', 'bf16_xdlops', 'int8_xdlops']
self.__precision_and_kernel_map = {
- 'FP64': '-r f64_r -f gemm',
- 'FP32_xDLOPS': '-r f32_r -f gemm_ex --compute_type f32_r',
- 'FP16_xDLOPS': '-r f16_r -f gemm_ex --compute_type f32_r',
- 'BF16_xDLOPS': '-r bf16_r -f gemm_ex --compute_type f32_r',
- 'INT8_xDLOPS': '--a_type i8_r --b_type i8_r --c_type i32_r --d_type i32_r -f gemm_ex --compute_type i32_r'
+ 'fp64': '-r f64_r -f gemm',
+ 'fp32_xdlops': '-r f32_r -f gemm_ex --compute_type f32_r',
+ 'fp16_xdlops': '-r f16_r -f gemm_ex --compute_type f32_r',
+ 'bf16_xdlops': '-r bf16_r -f gemm_ex --compute_type f32_r',
+ 'int8_xdlops': '--a_type i8_r --b_type i8_r --c_type i32_r --d_type i32_r -f gemm_ex --compute_type i32_r'
}
def add_parser_arguments(self):
@@ -154,7 +154,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
)
return False
- self._result.add_result(precision, gflops)
+ self._result.add_result(self._metric_map[precision], gflops)
return True
diff --git a/superbench/benchmarks/micro_benchmarks/rocm_memory_bw_performance.py b/superbench/benchmarks/micro_benchmarks/rocm_memory_bw_performance.py
index 201e4a4d1..b9c79fef3 100644
--- a/superbench/benchmarks/micro_benchmarks/rocm_memory_bw_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/rocm_memory_bw_performance.py
@@ -64,21 +64,18 @@ def _process_raw_result(self, cmd_idx, raw_output):
mem_bw = -1
value_index = -1
- size_index = -1
valid = True
content = raw_output.splitlines()
try:
+ metric = self._metrics[self._mem_types.index(self._args.mem_type[cmd_idx])]
parse_logline = self._parse_logline_map[self._args.mem_type[cmd_idx]]
for line in content:
if parse_logline in line and value_index != -1:
line = line.split()
- mem_bw = float(line[value_index])
- metric = self._args.mem_type[cmd_idx] + '_' + line[size_index]
- self._result.add_result(metric, mem_bw)
+ mem_bw = max(mem_bw, float(line[value_index]))
elif 'mean' in line:
line = line.split()
value_index = line.index('mean')
- size_index = line.index('atts')
except BaseException:
valid = False
finally:
@@ -89,7 +86,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
)
)
return False
-
+ self._result.add_result(metric, mem_bw)
return True
diff --git a/superbench/benchmarks/micro_benchmarks/sharding_matmul.py b/superbench/benchmarks/micro_benchmarks/sharding_matmul.py
index 269813126..2f990eb95 100644
--- a/superbench/benchmarks/micro_benchmarks/sharding_matmul.py
+++ b/superbench/benchmarks/micro_benchmarks/sharding_matmul.py
@@ -256,7 +256,7 @@ def _benchmark(self):
logger.error('Unknown sharding mode - benchmark: {}, mode: {}.'.format(self._name, mode))
return False
- metric = '{}'.format(mode)
+ metric = '{}_time'.format(mode)
if not self._process_numeric_result(metric, elapse_times, reduce_type=ReduceType.MAX):
return False
diff --git a/superbench/benchmarks/micro_benchmarks/tcp_connectivity.py b/superbench/benchmarks/micro_benchmarks/tcp_connectivity.py
index f086a275e..1c0135f2f 100644
--- a/superbench/benchmarks/micro_benchmarks/tcp_connectivity.py
+++ b/superbench/benchmarks/micro_benchmarks/tcp_connectivity.py
@@ -182,12 +182,12 @@ def _process_raw_result(self, idx, raw_output):
mininum = float(res[labels.index('Minimum')].strip('ms'))
maximum = float(res[labels.index('Maximum')].strip('ms'))
average = float(res[labels.index('Average')].strip('ms'))
- self._result.add_result('Successed_' + host, suc)
- self._result.add_result('Failed_' + host, fail)
- self._result.add_result('Success_Rate_' + host, rate)
- self._result.add_result('Minimum_' + host, mininum)
- self._result.add_result('Maximum_' + host, maximum)
- self._result.add_result('Average_' + host, average)
+ self._result.add_result(host + '_successed_count', suc)
+ self._result.add_result(host + '_failed_count', fail)
+ self._result.add_result(host + '_success_rate', rate)
+ self._result.add_result(host + '_time_min', mininum)
+ self._result.add_result(host + '_time_max', maximum)
+ self._result.add_result(host + '_time_avg', average)
except Exception as e:
logger.error(
'The result format is invalid - round: {}, benchmark: {}, address: {}, raw output: {}, message: {}.'.
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index c00634097..523cde0e3 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -131,16 +131,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
success = False
try:
+ model = self._args.pytorch_models[cmd_idx]
for line in raw_output.strip().splitlines():
line = line.strip()
if '[I] mean:' in line or '[I] percentile:' in line:
tag = 'mean' if '[I] mean:' in line else '99'
lats = re.findall(r'(\d+\.\d+) ms', line)
if len(lats) == 1:
- self._result.add_result(f'gpu_lat_ms_{tag}', float(lats[0]))
+ self._result.add_result(f'{model}_gpu_time_{tag}', float(lats[0]))
elif len(lats) == 2:
- self._result.add_result(f'host_lat_ms_{tag}', float(lats[0]))
- self._result.add_result(f'end_to_end_lat_ms_{tag}', float(lats[1]))
+ self._result.add_result(f'{model}_host_time_{tag}', float(lats[0]))
+ self._result.add_result(f'{model}_end_to_end_time_{tag}', float(lats[1]))
success = True
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
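
The parsing above relies on trtexec printing `[I] mean:` / `[I] percentile:` summary lines in which one `<float> ms` value means a GPU latency and two values mean host and end-to-end latencies. A standalone sketch of that extraction, using fabricated log lines shaped to match the regex (the `model_0` name mirrors the updated unit test):

```python
import re

# Fabricated, regex-shaped summary lines; real trtexec output is more verbose.
lines = [
    '[I] mean: 0.5 ms (GPU)',
    '[I] mean: 0.6 ms (host), 1.0 ms (end to end)',
]

model, tag = 'model_0', 'mean'
for line in lines:
    lats = re.findall(r'(\d+\.\d+) ms', line)
    if len(lats) == 1:
        print(f'{model}_gpu_time_{tag}', float(lats[0]))          # 0.5
    elif len(lats) == 2:
        print(f'{model}_host_time_{tag}', float(lats[0]))         # 0.6
        print(f'{model}_end_to_end_time_{tag}', float(lats[1]))   # 1.0
```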
diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
index 451c4c246..a0fdbe9fb 100644
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -373,7 +373,10 @@ def __process_model_result(self, model_action, precision, step_times):
)
return False
- metric = 'steptime_{}_{}'.format(model_action, precision)
+ precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'}
+ if precision.value in precision_metric:
+ precision = precision_metric[precision.value]
+ metric = '{}_{}_step_time'.format(precision, model_action)
self._result.add_raw_data(metric, step_times)
avg = statistics.mean(step_times)
self._result.add_result(metric, avg, reduce_type=ReduceType.MAX if model_action is ModelAction.TRAIN else None)
@@ -381,7 +384,7 @@ def __process_model_result(self, model_action, precision, step_times):
# The unit of step time is millisecond, use it to calculate the throughput with the unit samples/sec.
millisecond_per_second = 1000
throughput = [millisecond_per_second / step_time * self._args.batch_size for step_time in step_times]
- metric = 'throughput_{}_{}'.format(model_action, precision)
+ metric = '{}_{}_throughput'.format(precision, model_action)
self._result.add_raw_data(metric, throughput)
avg = statistics.mean(throughput)
self._result.add_result(metric, avg, reduce_type=ReduceType.MIN if model_action is ModelAction.TRAIN else None)
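
To make the renaming concrete: metrics move from `steptime_{action}_{precision}` / `throughput_{action}_{precision}` to `{short_precision}_{action}_step_time` / `{short_precision}_{action}_throughput`, with throughput derived from the millisecond step time. A compact sketch of both steps; the batch size of 32 is illustrative and happens to reproduce the 2.0 ms / 16000 samples-per-second figures asserted in the updated tests:

```python
import statistics

precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'}

precision, model_action, batch_size = 'float32', 'train', 32
step_times = [2.0] * 8  # milliseconds per step

short = precision_metric.get(precision, precision)
step_metric = '{}_{}_step_time'.format(short, model_action)
throughput = [1000 / t * batch_size for t in step_times]  # samples/sec
throughput_metric = '{}_{}_throughput'.format(short, model_action)

print(step_metric, statistics.mean(step_times))        # fp32_train_step_time 2.0
print(throughput_metric, statistics.mean(throughput))  # fp32_train_throughput 16000.0
```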
diff --git a/tests/benchmarks/docker_benchmarks/test_rocm_onnxruntime_performance.py b/tests/benchmarks/docker_benchmarks/test_rocm_onnxruntime_performance.py
index 317b82f86..c64cf7343 100644
--- a/tests/benchmarks/docker_benchmarks/test_rocm_onnxruntime_performance.py
+++ b/tests/benchmarks/docker_benchmarks/test_rocm_onnxruntime_performance.py
@@ -44,13 +44,13 @@ def test_rocm_onnxruntime_performance():
"samples_per_second": 274.455
"""
assert (benchmark._process_raw_result(0, raw_output))
- assert (benchmark.result['bert_large_uncased_ngpu_1'][0] == 21.829)
- assert (benchmark.result['bert_large_uncased_ngpu_8'][0] == 147.181)
- assert (benchmark.result['distilbert_base_uncased_ngpu_1'][0] == 126.827)
- assert (benchmark.result['distilbert_base_uncased_ngpu_8'][0] == 966.796)
- assert (benchmark.result['gpt2_ngpu_1'][0] == 20.46)
- assert (benchmark.result['gpt2_ngpu_8'][0] == 151.089)
- assert (benchmark.result['facebook_bart_large_ngpu_1'][0] == 66.171)
- assert (benchmark.result['facebook_bart_large_ngpu_8'][0] == 370.343)
- assert (benchmark.result['roberta_large_ngpu_1'][0] == 37.103)
- assert (benchmark.result['roberta_large_ngpu_8'][0] == 274.455)
+ assert (benchmark.result['bert_large_uncased_ngpu_1_throughput'][0] == 21.829)
+ assert (benchmark.result['bert_large_uncased_ngpu_8_throughput'][0] == 147.181)
+ assert (benchmark.result['distilbert_base_uncased_ngpu_1_throughput'][0] == 126.827)
+ assert (benchmark.result['distilbert_base_uncased_ngpu_8_throughput'][0] == 966.796)
+ assert (benchmark.result['gpt2_ngpu_1_throughput'][0] == 20.46)
+ assert (benchmark.result['gpt2_ngpu_8_throughput'][0] == 151.089)
+ assert (benchmark.result['facebook_bart_large_ngpu_1_throughput'][0] == 66.171)
+ assert (benchmark.result['facebook_bart_large_ngpu_8_throughput'][0] == 370.343)
+ assert (benchmark.result['roberta_large_ngpu_1_throughput'][0] == 37.103)
+ assert (benchmark.result['roberta_large_ngpu_8_throughput'][0] == 274.455)
diff --git a/tests/benchmarks/micro_benchmarks/test_cuda_gemm_flops_performance.py b/tests/benchmarks/micro_benchmarks/test_cuda_gemm_flops_performance.py
index a9a9673a3..caf387093 100644
--- a/tests/benchmarks/micro_benchmarks/test_cuda_gemm_flops_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cuda_gemm_flops_performance.py
@@ -38,7 +38,7 @@ def test_flops_performance_cuda(self):
# Negative case - MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE.
benchmark = benchmark_class(
benchmark_name,
- parameters='--num_warmup 200 --n 1024 --k 512 --m 2048 --precision FP32 TF32_TC FP16_TC INT8_TC'
+ parameters='--num_warmup 200 --n 1024 --k 512 --m 2048 --precision fp32 tf32_tc fp16_tc int8_tc'
)
ret = benchmark._preprocess()
@@ -59,11 +59,11 @@ def test_flops_performance_cuda(self):
assert (benchmark._args.n == 1024)
assert (benchmark._args.k == 512)
assert (benchmark._args.m == 2048)
- assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])
- benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
+ assert (benchmark._args.precision == ['fp32', 'tf32_tc', 'fp16_tc', 'int8_tc'])
+ benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = ['fp32', 'tf32_tc', 'fp16_tc', 'int8_tc']
# Check results and metrics.
- raw_output_FP32 = """
+ raw_output_fp32 = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
@@ -72,7 +72,7 @@ def test_flops_performance_cuda(self):
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tn_align1,passed,success,universal,16384,16384,16384,f32:row,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,482.034,6.22363,18249
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tt_align1,passed,success,universal,16384,16384,16384,f32:row,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.838,6.22616,18256.4
"""
- raw_output_TF32_TC = """
+ raw_output_tf32_tc = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
@@ -81,7 +81,7 @@ def test_flops_performance_cuda(self):
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tn_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,86.5167,34.6754,101676
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tt_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,68.3621,43.884,128677
"""
- raw_output_FP16_TC = """
+ raw_output_fp16_tc = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
@@ -90,13 +90,13 @@ def test_flops_performance_cuda(self):
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tn_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,39.0413,38.4209,225316
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tt_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,31.2994,47.9243,281048
"""
- assert (benchmark._process_raw_result(0, raw_output_FP32))
- assert (benchmark._process_raw_result(1, raw_output_TF32_TC))
- assert (benchmark._process_raw_result(2, raw_output_FP16_TC))
+ assert (benchmark._process_raw_result(0, raw_output_fp32))
+ assert (benchmark._process_raw_result(1, raw_output_tf32_tc))
+ assert (benchmark._process_raw_result(2, raw_output_fp16_tc))
- assert (benchmark.result['FP32'][0] == 18369.7)
- assert (benchmark.result['TF32_TC'][0] == 128677)
- assert (benchmark.result['FP16_TC'][0] == 281048)
+ assert (benchmark.result['fp32_flops'][0] == 18369.7)
+ assert (benchmark.result['tf32_tc_flops'][0] == 128677)
+ assert (benchmark.result['fp16_tc_flops'][0] == 281048)
# Negative case - Add invalid raw output.
assert (benchmark._process_raw_result(3, 'Invalid raw output') is False)
diff --git a/tests/benchmarks/micro_benchmarks/test_cuda_memory_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_cuda_memory_bw_performance.py
index af4c612d7..d72070e19 100644
--- a/tests/benchmarks/micro_benchmarks/test_cuda_memory_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cuda_memory_bw_performance.py
@@ -328,7 +328,7 @@ def test_cuda_memory_bw_performance(self):
bandwidthTest-D2D, Bandwidth = 762.8 GB/s, Time = 0.00009 s, Size = 68000000 bytes, NumDevsUsed = 1
Result = PASS
"""
- for i, metric in enumerate(['H2D_Mem_BW', 'D2H_Mem_BW', 'D2D_Mem_BW']):
+ for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
assert (benchmark._process_raw_result(i, raw_output[i]))
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
diff --git a/tests/benchmarks/micro_benchmarks/test_disk_performance.py b/tests/benchmarks/micro_benchmarks/test_disk_performance.py
index 52a220b32..710f5a23d 100644
--- a/tests/benchmarks/micro_benchmarks/test_disk_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_disk_performance.py
@@ -519,19 +519,19 @@ def test_disk_performance_result_parsing(self):
assert (1 == len(benchmark.result[jobname_prefix + '_write_iops']))
assert (85066.128925 == benchmark.result[jobname_prefix + '_write_iops'][0])
- assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_95.000000']))
- assert (1941504 == benchmark.result[jobname_prefix + '_read_lat_ns_95.000000'][0])
- assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.000000']))
- assert (2244608 == benchmark.result[jobname_prefix + '_read_lat_ns_99.000000'][0])
- assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.900000']))
- assert (3620864 == benchmark.result[jobname_prefix + '_read_lat_ns_99.900000'][0])
-
- assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_95.000000']))
- assert (1908736 == benchmark.result[jobname_prefix + '_write_lat_ns_95.000000'][0])
- assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.000000']))
- assert (2072576 == benchmark.result[jobname_prefix + '_write_lat_ns_99.000000'][0])
- assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.900000']))
- assert (2605056 == benchmark.result[jobname_prefix + '_write_lat_ns_99.900000'][0])
+ assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_95.0']))
+ assert (1941504 == benchmark.result[jobname_prefix + '_read_lat_ns_95.0'][0])
+ assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.0']))
+ assert (2244608 == benchmark.result[jobname_prefix + '_read_lat_ns_99.0'][0])
+ assert (1 == len(benchmark.result[jobname_prefix + '_read_lat_ns_99.9']))
+ assert (3620864 == benchmark.result[jobname_prefix + '_read_lat_ns_99.9'][0])
+
+ assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_95.0']))
+ assert (1908736 == benchmark.result[jobname_prefix + '_write_lat_ns_95.0'][0])
+ assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.0']))
+ assert (2072576 == benchmark.result[jobname_prefix + '_write_lat_ns_99.0'][0])
+ assert (1 == len(benchmark.result[jobname_prefix + '_write_lat_ns_99.9']))
+ assert (2605056 == benchmark.result[jobname_prefix + '_write_lat_ns_99.9'][0])
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
diff --git a/tests/benchmarks/micro_benchmarks/test_gemm_flops_performance_base.py b/tests/benchmarks/micro_benchmarks/test_gemm_flops_performance_base.py
index c44d7c94b..8ba1b7552 100644
--- a/tests/benchmarks/micro_benchmarks/test_gemm_flops_performance_base.py
+++ b/tests/benchmarks/micro_benchmarks/test_gemm_flops_performance_base.py
@@ -72,7 +72,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
return True
-def test_memory_bw_performance_base():
+def test_gemm_flops_performance_base():
"""Test GemmFlopsBenchmark."""
# Positive case - memory=pinned.
benchmark = FakeGemmFlopsBenchmark('fake')
@@ -81,49 +81,49 @@ def test_memory_bw_performance_base():
assert (benchmark.return_code == ReturnCode.SUCCESS)
# Check command list
expected_command = [
- 'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision FP64_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision TF32_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision BF16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision FP16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision INT8_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision INT4_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"'
+ 'echo "--precision fp64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision fp32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision fp16 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision fp64_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision tf32_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision bf16_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision fp16_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision int8_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision int4_tc --m 16384 --n 16384 --k 16384 --num_warmup 5"'
]
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
for i, metric in enumerate(
- ['FP64', 'FP32', 'FP16', 'FP64_TC', 'TF32_TC', 'BF16_TC', 'FP16_TC', 'INT8_TC', 'INT4_TC']
+ ['fp64', 'fp32', 'fp16', 'fp64_tc', 'tf32_tc', 'bf16_tc', 'fp16_tc', 'int8_tc', 'int4_tc']
):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
# Positive case - memory=unpinned.
- benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 FP32 FP16')
+ benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision fp64 fp32 fp16')
assert (benchmark._benchmark_type == BenchmarkType.MICRO)
assert (benchmark.run())
assert (benchmark.return_code == ReturnCode.SUCCESS)
# Check command list
expected_command = [
- 'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
- 'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"'
+ 'echo "--precision fp64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision fp32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
+ 'echo "--precision fp16 --m 16384 --n 16384 --k 16384 --num_warmup 5"'
]
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
- for i, metric in enumerate(['FP64', 'FP32', 'FP16']):
+ for i, metric in enumerate(['fp64', 'fp32', 'fp16']):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
- benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 BF64')
+ benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision fp64 bf64')
assert (benchmark._benchmark_type == BenchmarkType.MICRO)
assert (benchmark.run() is True)
# Negative case - INVALID_ARGUMENT.
- benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision BF64')
+ benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision bf64')
assert (benchmark._benchmark_type == BenchmarkType.MICRO)
assert (benchmark.run() is False)
assert (benchmark.return_code == ReturnCode.NO_SUPPORTED_PRECISION)
diff --git a/tests/benchmarks/micro_benchmarks/test_gpcnet_performance.py b/tests/benchmarks/micro_benchmarks/test_gpcnet_performance.py
index 74c27e1b0..f58647017 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpcnet_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpcnet_performance.py
@@ -98,14 +98,19 @@ def test_gpcnet_network_test(self):
# Check function process_raw_data.
# Positive case - valid raw output.
assert (benchmark._process_raw_result(0, raw_output))
- test_name = 'IsolatedNetworkTests'
metric_list = [
- 'RRTwo-sidedLat(8B)', 'RRGetLat(8B)', 'RRTwo-sidedBW(131072B)', 'RRPutBW(131072B)',
- 'RRTwo-sidedBW+Sync(131072B)', 'NatTwo-sidedBW(131072B)', 'MultipleAllreduce(8B)', 'MultipleAlltoall(4096B)'
+ 'rr_two-sided_lat',
+ 'rr_get_lat',
+ 'rr_two-sided_bw',
+ 'rr_put_bw',
+ 'rr_two-sided+sync_bw',
+ 'nat_two-sided_bw',
+ 'multiple_allreduce_time',
+ 'multiple_alltoall_bw',
]
for metric_medium in metric_list:
- for suffix in ['Avg', '99%']:
- metric = test_name + '_' + metric_medium + '_' + suffix
+ for suffix in ['avg', '99%']:
+ metric = metric_medium + '_' + suffix
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
@@ -253,58 +258,10 @@ def test_gpcnet_network_load(self): # noqa: C901
assert (len(benchmark.result) == benchmark.default_metric_count)
# Positive case - valid raw output.
assert (benchmark._process_raw_result(0, raw_output))
- test_name = 'IsolatedNetworkTests'
- metric_list = ['RRTwo-sidedLat(8B)', 'RRTwo-sidedBW+Sync(131072B)', 'MultipleAllreduce(8B)']
+ metric_list = ['rr_two-sided_lat_x', 'rr_two-sided+sync_bw_x', 'multiple_allreduce_x']
for metric_medium in metric_list:
- for suffix in ['Max', 'Min', 'Avg', '99.9%']:
- metric = test_name + '_' + metric_medium + '_' + suffix
- assert (metric in benchmark.result)
- assert (len(benchmark.result[metric]) == 1)
- assert (isinstance(benchmark.result[metric][0], numbers.Number))
- test_name = 'IsolatedCongestionTests'
- metric_list = ['GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)']
- for metric_medium in metric_list:
- for suffix in ['Max', 'Min', 'Avg', '99.9%']:
- metric = test_name + '_' + metric_medium + '_' + suffix
- assert (metric in benchmark.result)
- assert (len(benchmark.result[metric]) == 1)
- assert (isinstance(benchmark.result[metric][0], numbers.Number))
- test_name = 'NetworkTestsrunningwithCongestionTests(RRTwo-sidedLatNetworkTest)'
- metric_list = [
- 'GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)', 'RRTwo-sidedLat(8B)'
- ]
- for metric_medium in metric_list:
- for suffix in ['Max', 'Min', 'Avg', '99.9%']:
- metric = test_name + '_' + metric_medium + '_' + suffix
- assert (metric in benchmark.result)
- assert (len(benchmark.result[metric]) == 1)
- assert (isinstance(benchmark.result[metric][0], numbers.Number))
- test_name = 'NetworkTestsrunningwithCongestionTests(RRTwo-sidedBW+SyncNetworkTest)'
- metric_list = [
- 'GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)',
- 'RRTwo-sidedBW+Sync(131072B)'
- ]
- for metric_medium in metric_list:
- for suffix in ['Max', 'Min', 'Avg', '99.9%']:
- metric = test_name + '_' + metric_medium + '_' + suffix
- assert (metric in benchmark.result)
- assert (len(benchmark.result[metric]) == 1)
- assert (isinstance(benchmark.result[metric][0], numbers.Number))
- test_name = 'NetworkTestsrunningwithCongestionTests(MultipleAllreduceNetworkTest)'
- metric_list = [
- 'GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)', 'MultipleAllreduce(8B)'
- ]
- for metric_medium in metric_list:
- for suffix in ['Max', 'Min', 'Avg', '99.9%']:
- metric = test_name + '_' + metric_medium + '_' + suffix
- assert (metric in benchmark.result)
- assert (len(benchmark.result[metric]) == 1)
- assert (isinstance(benchmark.result[metric][0], numbers.Number))
- test_name = 'NetworkTestsrunningwithCongestionTests-KeyResults'
- metric_list = ['RRTwo-sidedLat(8B)', 'RRTwo-sidedBW+Sync(131072B)', 'MultipleAllreduce(8B)']
- for metric_medium in metric_list:
- for suffix in ['Avg', '99%']:
- metric = test_name + '_' + metric_medium + '_' + suffix
+ for suffix in ['avg', '99%']:
+ metric = metric_medium + '_' + suffix
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_copy_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_gpu_copy_bw_performance.py
index 7c9932c93..28b6d18ba 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_copy_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_copy_bw_performance.py
@@ -119,8 +119,8 @@ def _test_gpu_copy_bw_performance_result_parsing(self, platform):
else:
assert (len(benchmark.result[output_key]) == 1)
assert (isinstance(benchmark.result[output_key][0], numbers.Number))
- assert (output_key in test_raw_output_dict)
- assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])
+ assert (output_key.strip('_bw') in test_raw_output_dict)
+ assert (test_raw_output_dict[output_key.strip('_bw')] == benchmark.result[output_key][0])
# Negative case - invalid raw output.
assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
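
One caveat on the assertion rewrite above: `str.strip('_bw')` strips any of the characters `_`, `b`, `w` from both ends of the string, not the literal `_bw` suffix. It behaves as intended here only because every output key ends in exactly `_bw` and the character before it is not in that set. A safer suffix removal, should that assumption ever break (the sample keys are hypothetical):

```python
def remove_suffix(key: str, suffix: str = '_bw') -> str:
    """Remove an exact trailing suffix (str.removesuffix on Python 3.9+ does the same)."""
    return key[:-len(suffix)] if key.endswith(suffix) else key

assert remove_suffix('gpu0_to_gpu1_bw') == 'gpu0_to_gpu1'
assert 'h2d_bw'.strip('_bw') == 'h2d'               # works for this key...
assert 'numa0_web_bw'.strip('_bw') == 'numa0_we'    # ...but over-strips this one
```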
diff --git a/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
index 9a03f43da..b325508a7 100644
--- a/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
@@ -158,7 +158,7 @@ def test_ib_loopback_all_sizes(self, mock_ib_devices, mock_numa_cores, mock_port
metric_list = []
for ib_command in benchmark._args.commands:
for size in ['8388608', '4194304', '1024', '2']:
- metric = 'IB_{}_{}_Avg_{}'.format(ib_command, size, str(benchmark._args.ib_index))
+ metric = 'ib_{}_{}_ib{}_bw'.format(ib_command, size, str(benchmark._args.ib_index))
metric_list.append(metric)
for metric in metric_list:
assert (metric in benchmark.result)
@@ -270,7 +270,7 @@ def test_ib_loopback_8M_size(self, mock_ib_devices, mock_numa_cores, mock_port):
# Positive case - valid raw output.
metric_list = []
for ib_command in benchmark._args.commands:
- metric = 'IB_{}_8388608_Avg_{}'.format(ib_command, str(benchmark._args.ib_index))
+ metric = 'ib_{}_8388608_ib{}_bw'.format(ib_command, str(benchmark._args.ib_index))
metric_list.append(metric)
for metric in metric_list:
assert (metric in benchmark.result)
diff --git a/tests/benchmarks/micro_benchmarks/test_ib_traffic_performance.py b/tests/benchmarks/micro_benchmarks/test_ib_traffic_performance.py
index 220fa3e2e..89271bb8e 100644
--- a/tests/benchmarks/micro_benchmarks/test_ib_traffic_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_ib_traffic_performance.py
@@ -27,6 +27,9 @@ def setUp(self):
def tearDown(self):
"""Method called after the test method has been called and the result recorded."""
self.__binary_file.unlink()
+ p = Path('hostfile')
+ if p.is_file():
+ p.unlink()
def test_generate_config(self): # noqa: C901
"""Test util functions ."""
@@ -126,15 +129,18 @@ def test_ib_traffic_performance(self, mock_ib_devices):
# Check preprocess
# Negative cases
- parameters = '--ib_index 0 --iters 2000 --pattern one-to-one'
+ parameters = '--ib_index 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
mock_ib_devices.return_value = None
ret = benchmark._preprocess()
assert (ret is False)
assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_MPI_INIT_FAILURE)
+ hosts = ['node0\n', 'node1\n', 'node2\n', 'node3\n']
+ with open('hostfile', 'w') as f:
+ f.writelines(hosts)
os.environ['OMPI_COMM_WORLD_SIZE'] = '4'
- parameters = '--ib_index 0 --iters 2000 --pattern one-to-one'
+ parameters = '--ib_index 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
mock_ib_devices.return_value = None
ret = benchmark._preprocess()
@@ -143,21 +149,21 @@ def test_ib_traffic_performance(self, mock_ib_devices):
# Positive cases
os.environ['OMPI_COMM_WORLD_SIZE'] = '3'
- parameters = '--ib_index 0 --iters 2000 --pattern one-to-one'
+ parameters = '--ib_index 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
mock_ib_devices.return_value = ['mlx5_0']
ret = benchmark._preprocess()
assert (ret is True)
# Generate config
- parameters = '--ib_index 0 --iters 2000 --msg_size 33554432'
+ parameters = '--ib_index 0 --iters 2000 --msg_size 33554432 --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
os.environ['OMPI_COMM_WORLD_SIZE'] = '4'
mock_ib_devices.return_value = ['mlx5_0']
ret = benchmark._preprocess()
Path('config.txt').unlink()
assert (ret)
- expect_command = 'ib_validation --hostfile /root/hostfile --cmd_prefix "ib_write_bw -F ' + \
+ expect_command = 'ib_validation --hostfile hostfile --cmd_prefix "ib_write_bw -F ' + \
'--iters=2000 -d mlx5_0 -s 33554432" --input_config ' + os.getcwd() + '/config.txt'
command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
assert (command == expect_command)
@@ -167,14 +173,14 @@ def test_ib_traffic_performance(self, mock_ib_devices):
with open('test_config.txt', 'w') as f:
for line in config:
f.write(line + '\n')
- parameters = '--ib_index 0 --iters 2000 --msg_size 33554432 --config test_config.txt'
+ parameters = '--ib_index 0 --iters 2000 --msg_size 33554432 --config test_config.txt --hostfile hostfile'
benchmark = benchmark_class(benchmark_name, parameters=parameters)
os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
mock_ib_devices.return_value = ['mlx5_0']
ret = benchmark._preprocess()
Path('test_config.txt').unlink()
assert (ret)
- expect_command = 'ib_validation --hostfile /root/hostfile --cmd_prefix "ib_write_bw -F ' + \
+ expect_command = 'ib_validation --hostfile hostfile --cmd_prefix "ib_write_bw -F ' + \
'--iters=2000 -d mlx5_0 -s 33554432" --input_config test_config.txt'
command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
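
The hunks above drop the hard-coded `/root/hostfile` in favor of a test-local `hostfile` that the test writes itself and the new `tearDown` removes. A minimal standalone sketch of that create-and-clean-up pattern (class and method names are illustrative; the node list matches the one written above):

```python
import unittest
from pathlib import Path


class HostfileFixtureTest(unittest.TestCase):
    """Illustrative fixture mirroring the hostfile handling in the test above."""
    def setUp(self):
        """Create the hostfile that a --hostfile argument would point at."""
        Path('hostfile').write_text('node0\nnode1\nnode2\nnode3\n')

    def tearDown(self):
        """Remove the hostfile even if the test body failed midway."""
        p = Path('hostfile')
        if p.is_file():
            p.unlink()

    def test_hostfile_exists(self):
        self.assertTrue(Path('hostfile').is_file())
```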
diff --git a/tests/benchmarks/micro_benchmarks/test_kernel_launch_overhead.py b/tests/benchmarks/micro_benchmarks/test_kernel_launch_overhead.py
index 2595748d6..e75aa426c 100644
--- a/tests/benchmarks/micro_benchmarks/test_kernel_launch_overhead.py
+++ b/tests/benchmarks/micro_benchmarks/test_kernel_launch_overhead.py
@@ -36,7 +36,7 @@ def test_kernel_launch_overhead():
assert ('raw_output_0' in benchmark.raw_data)
assert (len(benchmark.raw_data['raw_output_0']) == 1)
assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))
- for metric in ['event_overhead', 'wall_overhead']:
+ for metric in ['event_time', 'wall_time']:
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
diff --git a/tests/benchmarks/micro_benchmarks/test_matmul.py b/tests/benchmarks/micro_benchmarks/test_matmul.py
index 670e65085..d267dd2ff 100644
--- a/tests/benchmarks/micro_benchmarks/test_matmul.py
+++ b/tests/benchmarks/micro_benchmarks/test_matmul.py
@@ -35,6 +35,6 @@ def test_pytorch_matmul():
# Check results and metrics.
assert (benchmark.run_count == 2)
assert (benchmark.return_code == ReturnCode.SUCCESS)
- assert (len(benchmark.raw_data['nosharding']) == benchmark.run_count)
- assert (len(benchmark.raw_data['nosharding'][0]) == benchmark._args.num_steps)
- assert (len(benchmark.result['nosharding']) == benchmark.run_count)
+ assert (len(benchmark.raw_data['nosharding_time']) == benchmark.run_count)
+ assert (len(benchmark.raw_data['nosharding_time'][0]) == benchmark._args.num_steps)
+ assert (len(benchmark.result['nosharding_time']) == benchmark.run_count)
diff --git a/tests/benchmarks/micro_benchmarks/test_memory_bw_performance_base.py b/tests/benchmarks/micro_benchmarks/test_memory_bw_performance_base.py
index 36cf36eda..c4c553dbd 100644
--- a/tests/benchmarks/micro_benchmarks/test_memory_bw_performance_base.py
+++ b/tests/benchmarks/micro_benchmarks/test_memory_bw_performance_base.py
@@ -83,7 +83,7 @@ def test_memory_bw_performance_base():
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
- for i, metric in enumerate(['H2D_Mem_BW', 'D2H_Mem_BW', 'D2D_Mem_BW']):
+ for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
@@ -97,7 +97,7 @@ def test_memory_bw_performance_base():
for i in range(len(expected_command)):
command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_command[i])
- for i, metric in enumerate(['H2D_Mem_BW', 'D2H_Mem_BW', 'D2D_Mem_BW']):
+ for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
diff --git a/tests/benchmarks/micro_benchmarks/test_rocm_gemm_flops_performance.py b/tests/benchmarks/micro_benchmarks/test_rocm_gemm_flops_performance.py
index 47cf94b4e..0433bc8d1 100644
--- a/tests/benchmarks/micro_benchmarks/test_rocm_gemm_flops_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_rocm_gemm_flops_performance.py
@@ -92,11 +92,11 @@ def test_rocm_flops_performance(self):
assert (benchmark._process_raw_result(3, raw_output_BF16_X))
assert (benchmark._process_raw_result(4, raw_output_INT8_X))
- assert (benchmark.result['FP64'][0] == 10037.5)
- assert (benchmark.result['FP32_xDLOPS'][0] == 39441.6)
- assert (benchmark.result['FP16_xDLOPS'][0] == 153728)
- assert (benchmark.result['BF16_xDLOPS'][0] == 81374.3)
- assert (benchmark.result['INT8_xDLOPS'][0] == 162675)
+ assert (benchmark.result['fp64_flops'][0] == 10037.5)
+ assert (benchmark.result['fp32_xdlops_flops'][0] == 39441.6)
+ assert (benchmark.result['fp16_xdlops_flops'][0] == 153728)
+ assert (benchmark.result['bf16_xdlops_flops'][0] == 81374.3)
+ assert (benchmark.result['int8_xdlops_iops'][0] == 162675)
# Negative case - Add invalid raw output.
assert (benchmark._process_raw_result(4, 'Invalid raw output') is False)
diff --git a/tests/benchmarks/micro_benchmarks/test_rocm_memory_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_rocm_memory_bw_performance.py
index 77df495d0..77bdf7145 100644
--- a/tests/benchmarks/micro_benchmarks/test_rocm_memory_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_rocm_memory_bw_performance.py
@@ -159,11 +159,11 @@ def test_rocm_memory_bw_performance(self):
might occur with a mixture of architectural capabilities.
"""
- for i, metric in enumerate(['htod_524288kB', 'htod_524288kB']):
+ for i, metric in enumerate(['h2d_bw', 'd2h_bw']):
assert (benchmark._process_raw_result(i, raw_output[i]))
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
- assert (benchmark.result['htod_524288kB'][0] == 24.6708)
- assert (benchmark.result['dtoh_524288kB'][0] == 27.9348)
+ assert (benchmark.result['h2d_bw'][0] == 25.2351)
+ assert (benchmark.result['d2h_bw'][0] == 27.9348)
diff --git a/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py b/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
index 214aa20bb..715404541 100644
--- a/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
+++ b/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
@@ -44,7 +44,7 @@ def test_pytorch_sharding_matmul():
# Check results and metrics.
assert (benchmark.run_count == 2)
assert (benchmark.return_code == ReturnCode.SUCCESS)
- for metric in ['allreduce', 'allgather']:
+ for metric in ['allreduce_time', 'allgather_time']:
assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
assert (len(benchmark.result[metric]) == benchmark.run_count)
diff --git a/tests/benchmarks/micro_benchmarks/test_tcp_connectivity.py b/tests/benchmarks/micro_benchmarks/test_tcp_connectivity.py
index 063e97a14..19ea4fe79 100644
--- a/tests/benchmarks/micro_benchmarks/test_tcp_connectivity.py
+++ b/tests/benchmarks/micro_benchmarks/test_tcp_connectivity.py
@@ -52,15 +52,15 @@ def test_tcp_connectivity(self):
assert (benchmark.result)
# Check results and metrics.
- assert (benchmark.result['Successed_api.github.com'][0] == 10)
- assert (benchmark.result['Failed_api.github.com'][0] == 0)
- assert (benchmark.result['Success_Rate_api.github.com'][0] == 100.0)
- assert (isinstance(benchmark.result['Minimum_api.github.com'][0], numbers.Number))
- assert (isinstance(benchmark.result['Maximum_api.github.com'][0], numbers.Number))
- assert (isinstance(benchmark.result['Average_api.github.com'][0], numbers.Number))
- assert (isinstance(benchmark.result['Successed_localhost'][0], numbers.Number))
- assert (isinstance(benchmark.result['Failed_localhost'][0], numbers.Number))
- assert (isinstance(benchmark.result['Maximum_localhost'][0], numbers.Number))
- assert (isinstance(benchmark.result['Minimum_localhost'][0], numbers.Number))
- assert (isinstance(benchmark.result['Average_localhost'][0], numbers.Number))
+ assert (benchmark.result['api.github.com_successed_count'][0] == 10)
+ assert (benchmark.result['api.github.com_failed_count'][0] == 0)
+ assert (benchmark.result['api.github.com_success_rate'][0] == 100.0)
+ assert (isinstance(benchmark.result['api.github.com_time_min'][0], numbers.Number))
+ assert (isinstance(benchmark.result['api.github.com_time_max'][0], numbers.Number))
+ assert (isinstance(benchmark.result['api.github.com_time_avg'][0], numbers.Number))
+ assert (isinstance(benchmark.result['localhost_successed_count'][0], numbers.Number))
+ assert (isinstance(benchmark.result['localhost_failed_count'][0], numbers.Number))
+ assert (isinstance(benchmark.result['localhost_time_max'][0], numbers.Number))
+ assert (isinstance(benchmark.result['localhost_time_min'][0], numbers.Number))
+ assert (isinstance(benchmark.result['localhost_time_avg'][0], numbers.Number))
assert (benchmark.return_code == ReturnCode.SUCCESS)
diff --git a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
index 34483295c..ba8bcbc2a 100644
--- a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
@@ -135,9 +135,9 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log):
self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
for tag in ['mean', '99']:
- self.assertEqual(0.5, benchmark.result[f'gpu_lat_ms_{tag}'][0])
- self.assertEqual(0.6, benchmark.result[f'host_lat_ms_{tag}'][0])
- self.assertEqual(1.0, benchmark.result[f'end_to_end_lat_ms_{tag}'][0])
+ self.assertEqual(0.5, benchmark.result[f'model_0_gpu_time_{tag}'][0])
+ self.assertEqual(0.6, benchmark.result[f'model_0_host_time_{tag}'][0])
+ self.assertEqual(1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0])
# Negative case - invalid raw output
self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))
diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py
index d71cd0058..a59e43668 100644
--- a/tests/benchmarks/model_benchmarks/test_model_base.py
+++ b/tests/benchmarks/model_benchmarks/test_model_base.py
@@ -223,10 +223,10 @@ def test_train():
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
'"start_time": null, "end_time": null, "raw_data": {'
- '"steptime_train_float32": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
- '"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
- '"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0]}, '
- '"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min"}}'
+ '"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
+ '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
+ '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0]}, '
+ '"reduce_op": {"fp32_train_step_time": "max", "fp32_train_throughput": "min"}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__train(Precision.FLOAT32))
@@ -249,10 +249,11 @@ def test_inference():
expected_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, '
'"start_time": null, "end_time": null, "raw_data": {'
- '"steptime_inference_float16": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
- '"throughput_inference_float16": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
- '"result": {"return_code": [0], "steptime_inference_float16": [4.0], "throughput_inference_float16": '
- '[8000.0]}, "reduce_op": {"steptime_inference_float16": null, "throughput_inference_float16": null}}'
+ '"fp16_inference_step_time": [[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]], '
+ '"fp16_inference_throughput": [[8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0, 8000.0]]}, '
+ '"result": {"return_code": [0], '
+ '"fp16_inference_step_time": [4.0], "fp16_inference_throughput": [8000.0]}, '
+ '"reduce_op": {"fp16_inference_step_time": null, "fp16_inference_throughput": null}}'
)
assert (benchmark._preprocess())
assert (benchmark._ModelBenchmark__inference(Precision.FLOAT16))
@@ -280,31 +281,31 @@ def test_benchmark():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
expected_raw_data = {
- 'steptime_train_float32': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
- 'throughput_train_float32': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]],
- 'steptime_train_float16': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
- 'throughput_train_float16': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]
+ 'fp32_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
+ 'fp32_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]],
+ 'fp16_train_step_time': [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]],
+ 'fp16_train_throughput': [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]
}
assert (benchmark.raw_data == expected_raw_data)
expected_result = {
'return_code': [0],
- 'steptime_train_float32': [2.0],
- 'throughput_train_float32': [16000.0],
- 'steptime_train_float16': [2.0],
- 'throughput_train_float16': [16000.0]
+ 'fp32_train_step_time': [2.0],
+ 'fp32_train_throughput': [16000.0],
+ 'fp16_train_step_time': [2.0],
+ 'fp16_train_throughput': [16000.0]
}
assert (benchmark.result == expected_result)
expected_serialized_result = (
'{"name": "pytorch-fake-model", "type": "model", "run_count": 1, "return_code": 0, "start_time": null, '
- '"end_time": null, "raw_data": {"steptime_train_float32": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
- '"throughput_train_float32": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
- '"steptime_train_float16": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
- '"throughput_train_float16": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
- '"result": {"return_code": [0], "steptime_train_float32": [2.0], "throughput_train_float32": [16000.0], '
- '"steptime_train_float16": [2.0], "throughput_train_float16": [16000.0]}, '
- '"reduce_op": {"steptime_train_float32": "max", "throughput_train_float32": "min", '
- '"steptime_train_float16": "max", "throughput_train_float16": "min"}}'
+ '"end_time": null, "raw_data": {"fp32_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
+ '"fp32_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]], '
+ '"fp16_train_step_time": [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]], '
+ '"fp16_train_throughput": [[16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0, 16000.0]]}, '
+ '"result": {"return_code": [0], "fp32_train_step_time": [2.0], "fp32_train_throughput": [16000.0], '
+ '"fp16_train_step_time": [2.0], "fp16_train_throughput": [16000.0]}, '
+ '"reduce_op": {"fp32_train_step_time": "max", "fp32_train_throughput": "min", '
+ '"fp16_train_step_time": "max", "fp16_train_throughput": "min"}}'
)
assert (benchmark.serialized_result == expected_serialized_result)
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
index 0db5db0c1..106b7ca77 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
@@ -188,8 +188,7 @@ def test_pytorch_base():
# Test results.
for metric in [
- 'steptime_train_float32', 'steptime_inference_float32', 'throughput_train_float32',
- 'throughput_inference_float32'
+ 'fp32_train_step_time', 'fp32_inference_step_time', 'fp32_train_throughput', 'fp32_inference_throughput'
]:
assert (len(benchmark.raw_data[metric]) == 1)
assert (len(benchmark.raw_data[metric][0]) == 64)
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_bert.py b/tests/benchmarks/model_benchmarks/test_pytorch_bert.py
index 6d1227ce3..f1e1a650d 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_bert.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_bert.py
@@ -50,9 +50,8 @@ def test_pytorch_bert_base():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
for metric in [
- 'steptime_train_float32', 'throughput_train_float32', 'steptime_train_float16', 'throughput_train_float16',
- 'steptime_inference_float32', 'throughput_inference_float32', 'steptime_inference_float16',
- 'throughput_inference_float16'
+ 'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
+ 'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
]:
assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_cnn.py b/tests/benchmarks/model_benchmarks/test_pytorch_cnn.py
index f7f0048d8..095e32290 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_cnn.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_cnn.py
@@ -17,9 +17,9 @@ def test_pytorch_cnn_with_gpu():
parameters='--batch_size 1 --image_size 224 --num_classes 5 --num_warmup 2 --num_steps 4 \
--model_action train inference',
check_metrics=[
- 'steptime_train_float32', 'throughput_train_float32', 'steptime_train_float16', 'throughput_train_float16',
- 'steptime_inference_float32', 'throughput_inference_float32', 'steptime_inference_float16',
- 'throughput_inference_float16'
+ 'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
+ 'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time',
+ 'fp16_inference_throughput'
]
)
@@ -32,8 +32,7 @@ def test_pytorch_cnn_no_gpu():
parameters='--batch_size 1 --image_size 224 --num_classes 5 --num_warmup 2 --num_steps 4 \
--model_action train inference --precision float32 --no_gpu',
check_metrics=[
- 'steptime_train_float32', 'throughput_train_float32', 'steptime_inference_float32',
- 'throughput_inference_float32'
+ 'fp32_train_step_time', 'fp32_train_throughput', 'fp32_inference_step_time', 'fp32_inference_throughput'
]
)
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_gpt2.py b/tests/benchmarks/model_benchmarks/test_pytorch_gpt2.py
index 353d18a23..8b38e9c76 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_gpt2.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_gpt2.py
@@ -49,9 +49,8 @@ def test_pytorch_gpt2_small():
assert (benchmark.run_count == 1)
assert (benchmark.return_code == ReturnCode.SUCCESS)
for metric in [
- 'steptime_train_float32', 'throughput_train_float32', 'steptime_train_float16', 'throughput_train_float16',
- 'steptime_inference_float32', 'throughput_inference_float32', 'steptime_inference_float16',
- 'throughput_inference_float16'
+ 'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
+ 'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time', 'fp16_inference_throughput'
]:
assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_lstm.py b/tests/benchmarks/model_benchmarks/test_pytorch_lstm.py
index ec4236c07..b2ce001e5 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_lstm.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_lstm.py
@@ -16,9 +16,9 @@ def test_pytorch_lstm_with_gpu():
parameters='--batch_size 1 --num_classes 5 --seq_len 8 --num_warmup 2 --num_steps 4 \
--model_action train inference',
check_metrics=[
- 'steptime_train_float32', 'throughput_train_float32', 'steptime_train_float16', 'throughput_train_float16',
- 'steptime_inference_float32', 'throughput_inference_float32', 'steptime_inference_float16',
- 'throughput_inference_float16'
+ 'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
+ 'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time',
+ 'fp16_inference_throughput'
]
)
@@ -30,8 +30,7 @@ def test_pytorch_lstm_no_gpu():
parameters='--batch_size 1 --num_classes 5 --seq_len 8 --num_warmup 2 --num_steps 4 \
--model_action train inference --precision float32 --no_gpu',
check_metrics=[
- 'steptime_train_float32', 'throughput_train_float32', 'steptime_inference_float32',
- 'throughput_inference_float32'
+ 'fp32_train_step_time', 'fp32_train_throughput', 'fp32_inference_step_time', 'fp32_inference_throughput'
]
)