From 981b1826e88ad868b441a00a3a0010fbba3cf981 Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Mon, 28 Jun 2021 17:07:21 +0800
Subject: [PATCH 01/10] add example and benchmark file of rdma loopback

---
 .../benchmarks/rdma_loopback_performance.py   |  23 ++
 .../benchmarks/micro_benchmarks/__init__.py   |   3 +-
 .../rdma_loopback_performance.py              | 215 ++++++++++++++++++
 superbench/common/utils/__init__.py           |   2 +-
 superbench/common/utils/network.py            |  19 ++
 .../test_computation_communication_overlap.py |   3 +-
 .../test_rdma_loopback_performance.py         |   0
 .../micro_benchmarks/test_sharding_matmul.py  |   3 +-
 tests/benchmarks/utils.py                     |  17 +-
 9 files changed, 266 insertions(+), 19 deletions(-)
 create mode 100644 examples/benchmarks/rdma_loopback_performance.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
 create mode 100644 superbench/common/utils/network.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py

diff --git a/examples/benchmarks/rdma_loopback_performance.py b/examples/benchmarks/rdma_loopback_performance.py
new file mode 100644
index 000000000..e001b00b3
--- /dev/null
+++ b/examples/benchmarks/rdma_loopback_performance.py
@@ -0,0 +1,23 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Model benchmark example for RDMA loopback performance.
+
+Commands to run:
+  python3 examples/benchmarks/rdma_loopback_performance_performance.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    parameters = '--ib_index 0 --numa 1'
+    context = BenchmarkRegistry.create_benchmark_context('rdma-loopback', parameters=parameters)
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 3b1b820f0..660eb5a9e 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -10,8 +10,9 @@
 from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
 from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
 from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
+from superbench.benchmarks.micro_benchmarks.rdma_loopback_performance import RDMALoopback
 
 __all__ = [
     'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
-    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda'
+    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'RDMALoopback'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
new file mode 100644
index 000000000..c6eef8af0
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
@@ -0,0 +1,215 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the RDMA loopback benchmarks."""
+
+import os
+import subprocess
+
+from superbench.common.utils import logger
+from superbench.common.utils import network
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class RDMALoopback(MicroBenchmarkWithInvoke):
+    """The RDMA loopback performance benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'run_perftest_loopback'
+        self.__support_ib_commands = ['ib_write_bw', 'ib_read_bw', 'ib_send_bw']
+        self.__message_sizes = ['8388608', '4194304', '2097152', '1048576']
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--ib_index',
+            type=int,
+            default=0,
+            required=True,
+            help='The index of ib device.',
+        )
+        self._parser.add_argument(
+            '--n',
+            type=int,
+            default=20000,
+            required=False,
+            help='The iterations of running ib command',
+        )
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=8388608,
+            required=False,
+            help='The message size of running ib command. E.g. {}.'.format(' '.join(self.__message_sizes)),
+        )
+        self._parser.add_argument(
+            '--commands',
+            type=str,
+            nargs='+',
+            default='ib_write_bw',
+            help='The ib command used to run. E.g. {}.'.format(' '.join(self.__support_ib_commands)),
+        )
+        self._parser.add_argument(
+            '--mode',
+            type=str,
+            default='AF',
+            help='The mode used to run ib command. Eg, AF(all message size) or S(single message size)',
+        )
+        self._parser.add_argument(
+            '--numa',
+            type=int,
+            default=0,
+            required=True,
+            help='The index of numa node.',
+        )
+
+    def __get_ib_devices(self):
+        """Get available ordered IB devices in the system and filter ethernet devices."""
+        # command = 'ls -l /sys/class/infiniband/* | awk \'{print $9}\' | sort | awk -F\'/\' \'{print $5}\''
+        command = "ibv_devinfo | awk '$1 ~ /hca_id/||/link_layer:/ {print $1,$2}' |  awk '{print $2}'"
+        output = subprocess.run(
+            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+        )
+        lines = output.stdout.splitlines()
+        ib_devices = []
+        for i in range(len(lines) - 1):
+            if 'InfiniBand' in lines[i + 1]:
+                ib_devices.append(lines[i])
+        return ib_devices
+
+    def __get_numa_cores(self, numa_index):
+        """Get the last two cores from different physical cpu core of NUMA<numa_index>.
+
+        Args:
+            numa_index (int): the index of numa node.
+
+        Return:
+            The last two cores from different physical cpu core of NUMA<numa_index>.
+        """
+        command = 'numactl --hardware | grep "node {} cpus:"'.format(numa_index)
+        output = subprocess.run(
+            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+        )
+        return output.stdout.splitlines()[0].split(' ')
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        if not isinstance(self._args.commands, list):
+            self._args.commands = [self._args.commands]
+        self._args.commands = [command.lower() for command in self._args.commands]
+
+        self._args.mode = self._args.mode.upper()
+
+        if str(self._args.size) not in self.__message_sizes:
+            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+            logger.error(
+                'Unsupported message size - benchmark: {}, size: {}, expect: {}.'.format(
+                    self._name, self._args.size, self.__message_sizes
+                )
+            )
+            return False
+
+        command_mode = ''
+        if self._args.mode == 'AF':
+            command_mode = ' -a'
+        elif self._args.mode == 'S':
+            command_mode = ' -s ' + self._args.size
+        else:
+            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+            logger.error(
+                'Unsupported args mode - benchmark: {}, mode: {}, expect: {}.'.format(
+                    self._name, self._args.mode, 'AF or S'
+                )
+            )
+            return False
+
+        for ib_command in self._args.commands:
+            if ib_command not in self.__support_ib_commands:
+                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+                logger.error(
+                    'Unsupported ib command - benchmark: {}, command: {}, expected: {}.'.format(
+                        self._name, ib_command, self.__support_ib_commands
+                    )
+                )
+                return False
+            else:
+                command = os.path.join(self._args.bin_dir, self._bin_name)
+                numa_cores = self.__get_numa_cores(self._args.numa)
+                server_core = int(numa_cores[-1])
+                client_core = int(numa_cores[-3])
+                command += ' ' + str(server_core) + ' ' + str(client_core)
+                command += ' ' + ib_command
+                command += command_mode + ' -F'
+                command += ' --iters=' + str(self._args.n)
+                command += ' -d ' + self.__get_ib_devices()[self._args.ib_index]
+                command += ' -p ' + str(network.get_free_port())
+                self._commands.append(command)
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        ib_command = self._args.commands[cmd_idx]
+        self._result.add_raw_data(
+            'raw_output_' + str(self._args.ib_index) + '_' + ib_command + '_' + self._args.mode, raw_output
+        )
+
+        valid = False
+        content = raw_output.splitlines()
+        try:
+            metric_set = set()
+            for line in content:
+                for i in range(len(self.__message_sizes)):
+                    if self.__message_sizes[i] in line:
+                        values = list(filter(None, line.split(' ')))
+                        avg_bw = float(values[-2])
+                        metric = 'RDMA_{}_{}_{}_{}_{}_avg'.format(
+                            str(self._args.ib_index), self._args.mode, self.__message_sizes[i], str(self._args.n),
+                            ib_command
+                        )
+                        if metric not in metric_set:
+                            metric_set.add(metric)
+                            self._result.add_result(metric, avg_bw)
+                            valid = True
+        except BaseException:
+            valid = False
+        finally:
+            if valid is False:
+                logger.error(
+                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                        self._curr_run_index, self._name, raw_output
+                    )
+                )
+                return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('rdma-loopback', RDMALoopback, parameters='--ib_index 0 --numa 1')
diff --git a/superbench/common/utils/__init__.py b/superbench/common/utils/__init__.py
index 708eed456..a339d41fd 100644
--- a/superbench/common/utils/__init__.py
+++ b/superbench/common/utils/__init__.py
@@ -9,4 +9,4 @@
 
 nv_helper = LazyImport('superbench.common.utils.nvidia_helper')
 
-__all__ = ['SuperBenchLogger', 'logger', 'create_output_dir', 'get_sb_config', 'LazyImport', 'nv_helper']
+__all__ = ['SuperBenchLogger', 'logger', 'create_output_dir', 'get_sb_config', 'LazyImport', 'nv_helper', 'network']
diff --git a/superbench/common/utils/network.py b/superbench/common/utils/network.py
new file mode 100644
index 000000000..4e96f24b4
--- /dev/null
+++ b/superbench/common/utils/network.py
@@ -0,0 +1,19 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Network Utility."""
+
+import socket
+from contextlib import closing
+
+
+def get_free_port():
+    """Get a free port in current system.
+
+    Return:
+        port (int): a free port in current system.
+    """
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(('', 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
diff --git a/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py b/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py
index 381dec3ac..598bdab09 100644
--- a/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py
+++ b/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py
@@ -7,6 +7,7 @@
 
 from tests.helper import decorator
 import tests.benchmarks.utils as utils
+from superbench.common.utils import network
 from superbench.benchmarks import BenchmarkRegistry, Framework, BenchmarkType, ReturnCode
 from superbench.benchmarks.micro_benchmarks.computation_communication_overlap \
     import ComputationCommunicationOverlap, ComputationKernelType
@@ -56,7 +57,7 @@ def test_pytorch_computation_communication_overlap_fake_distributed():
         parameters='--num_warmup 5 --num_steps 10 --ratio 5',
         framework=Framework.PYTORCH
     )
-    utils.setup_simulated_ddp_distributed_env(1, 0, utils.get_free_port())
+    utils.setup_simulated_ddp_distributed_env(1, 0, network.get_free_port())
     benchmark = BenchmarkRegistry.launch_benchmark(context)
 
     # Check basic information.
diff --git a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py b/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
index c828d7e18..03428d3b1 100644
--- a/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
+++ b/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
@@ -5,6 +5,7 @@
 
 import tests.benchmarks.utils as utils
 from tests.helper import decorator
+from superbench.common.utils import network
 from superbench.benchmarks import BenchmarkRegistry, Platform, Framework, BenchmarkType, ReturnCode
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul, ShardingMode
 
@@ -22,7 +23,7 @@ def test_pytorch_sharding_matmul():
 
     assert (BenchmarkRegistry.is_benchmark_context_valid(context))
 
-    utils.setup_simulated_ddp_distributed_env(1, 0, utils.get_free_port())
+    utils.setup_simulated_ddp_distributed_env(1, 0, network.get_free_port())
     benchmark = BenchmarkRegistry.launch_benchmark(context)
 
     # Check basic information.
diff --git a/tests/benchmarks/utils.py b/tests/benchmarks/utils.py
index 3ef897c5e..87c46b1d8 100644
--- a/tests/benchmarks/utils.py
+++ b/tests/benchmarks/utils.py
@@ -4,12 +4,11 @@
 """Utilities for benchmark tests."""
 
 import os
-import socket
-from contextlib import closing
 import multiprocessing as multiprocessing
 from multiprocessing import Process
 
 from superbench.benchmarks import BenchmarkRegistry
+from superbench.common.utils import network
 
 
 def clean_simulated_ddp_distributed_env():
@@ -21,18 +20,6 @@ def clean_simulated_ddp_distributed_env():
     os.environ.pop('MASTER_PORT')
 
 
-def get_free_port():
-    """Get a free port in current system.
-
-    Return:
-        port (int): a free port in current system.
-    """
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-        s.bind(('', 0))
-        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        return s.getsockname()[1]
-
-
 def setup_simulated_ddp_distributed_env(world_size, local_rank, port):
     """Function to setup the simulated DDP distributed envionment variables."""
     os.environ['WORLD_SIZE'] = str(world_size)
@@ -58,7 +45,7 @@ def simulated_ddp_distributed_benchmark(context, world_size):
     Return:
         results (list): list of benchmark results from #world_size number of processes.
     """
-    port = get_free_port()
+    port = network.get_free_port()
     process_list = []
     multiprocessing.set_start_method('spawn')
 

From e1e5fb85ccf01a2d19f4802a622ba9d075487b99 Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Mon, 28 Jun 2021 20:56:12 +0800
Subject: [PATCH 02/10] add test

---
 dockerfile/cuda11.1.1.dockerfile              |   1 +
 .../rdma_loopback_performance.py              |  46 ++---
 superbench/benchmarks/return_code.py          |   1 +
 superbench/common/utils/network.py            |  16 ++
 .../test_rdma_loopback_performance.py         | 161 ++++++++++++++++++
 5 files changed, 196 insertions(+), 29 deletions(-)

diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index 5beb60b43..833a9d6de 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -36,6 +36,7 @@ RUN apt-get update && \
     util-linux \
     vim \
     wget \
+    numactl \
     && \
     apt-get autoremove && \
     apt-get clean && \
diff --git a/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
index c6eef8af0..106546a31 100644
--- a/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
@@ -73,20 +73,6 @@ def add_parser_arguments(self):
             help='The index of numa node.',
         )
 
-    def __get_ib_devices(self):
-        """Get available ordered IB devices in the system and filter ethernet devices."""
-        # command = 'ls -l /sys/class/infiniband/* | awk \'{print $9}\' | sort | awk -F\'/\' \'{print $5}\''
-        command = "ibv_devinfo | awk '$1 ~ /hca_id/||/link_layer:/ {print $1,$2}' |  awk '{print $2}'"
-        output = subprocess.run(
-            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
-        )
-        lines = output.stdout.splitlines()
-        ib_devices = []
-        for i in range(len(lines) - 1):
-            if 'InfiniBand' in lines[i + 1]:
-                ib_devices.append(lines[i])
-        return ib_devices
-
     def __get_numa_cores(self, numa_index):
         """Get the last two cores from different physical cpu core of NUMA<numa_index>.
 
@@ -150,18 +136,22 @@ def _preprocess(self):
                 )
                 return False
             else:
-                command = os.path.join(self._args.bin_dir, self._bin_name)
-                numa_cores = self.__get_numa_cores(self._args.numa)
-                server_core = int(numa_cores[-1])
-                client_core = int(numa_cores[-3])
-                command += ' ' + str(server_core) + ' ' + str(client_core)
-                command += ' ' + ib_command
-                command += command_mode + ' -F'
-                command += ' --iters=' + str(self._args.n)
-                command += ' -d ' + self.__get_ib_devices()[self._args.ib_index]
-                command += ' -p ' + str(network.get_free_port())
-                self._commands.append(command)
-
+                try:
+                    command = os.path.join(self._args.bin_dir, self._bin_name)
+                    numa_cores = self.__get_numa_cores(self._args.numa)
+                    server_core = int(numa_cores[-1])
+                    client_core = int(numa_cores[-3])
+                    command += ' ' + str(server_core) + ' ' + str(client_core)
+                    command += ' ' + ib_command
+                    command += command_mode + ' -F'
+                    command += ' --iters=' + str(self._args.n)
+                    command += ' -d ' + network.get_ib_devices()[self._args.ib_index]
+                    command += ' -p ' + str(network.get_free_port())
+                    self._commands.append(command)
+                except BaseException:
+                    self._result.set_return_code(ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
+                    logger.error('Getting devices failure - benchmark: {}.'.format(self._name))
+                    return False
         return True
 
     def _process_raw_result(self, cmd_idx, raw_output):
@@ -177,9 +167,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
             True if the raw output string is valid and result can be extracted.
         """
         ib_command = self._args.commands[cmd_idx]
-        self._result.add_raw_data(
-            'raw_output_' + str(self._args.ib_index) + '_' + ib_command + '_' + self._args.mode, raw_output
-        )
+        self._result.add_raw_data('raw_output_' + str(cmd_idx) + '_IB' + str(self._args.ib_index), raw_output)
 
         valid = False
         content = raw_output.splitlines()
diff --git a/superbench/benchmarks/return_code.py b/superbench/benchmarks/return_code.py
index 0991ddb22..da207d01a 100644
--- a/superbench/benchmarks/return_code.py
+++ b/superbench/benchmarks/return_code.py
@@ -28,3 +28,4 @@ class ReturnCode(Enum):
     MICROBENCHMARK_EXECUTION_FAILURE = 32
     MICROBENCHMARK_RESULT_PARSING_FAILURE = 33
     MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE = 34
+    MICROBENCHMARK_DEVICE_GETTING_FAILURE = 35
diff --git a/superbench/common/utils/network.py b/superbench/common/utils/network.py
index 4e96f24b4..0ea767491 100644
--- a/superbench/common/utils/network.py
+++ b/superbench/common/utils/network.py
@@ -4,6 +4,7 @@
 """Network Utility."""
 
 import socket
+import subprocess
 from contextlib import closing
 
 
@@ -17,3 +18,18 @@ def get_free_port():
         s.bind(('', 0))
         s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
         return s.getsockname()[1]
+
+
+def get_ib_devices():
+    """Get available ordered IB devices in the system and filter ethernet devices."""
+    # command = 'ls -l /sys/class/infiniband/* | awk \'{print $9}\' | sort | awk -F\'/\' \'{print $5}\''
+    command = "ibv_devinfo | awk '$1 ~ /hca_id/||/link_layer:/ {print $1,$2}' |  awk '{print $2}'"
+    output = subprocess.run(
+        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+    )
+    lines = output.stdout.splitlines()
+    ib_devices = []
+    for i in range(len(lines) - 1):
+        if 'InfiniBand' in lines[i + 1]:
+            ib_devices.append(lines[i])
+    return ib_devices
diff --git a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
index e69de29bb..4ac69c02b 100644
--- a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
@@ -0,0 +1,161 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for rdma-loopback benchmark."""
+
+import os
+import numbers
+import unittest
+from pathlib import Path
+
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform, BenchmarkType
+from superbench.common.utils import network
+
+
+class RDMALoopbackTest(unittest.TestCase):
+    """Tests for RDMALoopback benchmark."""
+    def create_fake_bin(self):
+        """Method called to prepare the test fixture."""
+        # Create fake binary file just for testing.
+        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
+        binary_path = os.path.join(os.getenv('SB_MICRO_PATH'), 'bin')
+        Path(binary_path).mkdir(parents=True, exist_ok=True)
+        self.__binary_file = Path(os.path.join(binary_path, 'run_perftest_loopback'))
+        self.__binary_file.touch(mode=0o755, exist_ok=True)
+
+    def test_rdma_loopback_performance(self):
+        """Test rdma-loopback benchmark."""
+        # Condition without RDMA devices
+        if (len(network.get_ib_devices()) < 1):
+            # Test for registry.
+            benchmark_name = 'rdma-loopback'
+            (benchmark_class,
+             predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+            assert (benchmark_class)
+
+            # Test for preprocess
+            parameters = '--ib_index 0 --numa 0 --n 2000'
+            benchmark = benchmark_class(benchmark_name, parameters=parameters)
+            self.create_fake_bin()
+            ret = benchmark._preprocess()
+            assert (ret is False)
+            assert (benchmark.return_code is ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
+
+            raw_output_AF = """
+    ************************************
+    * Waiting for client to connect... *
+    ************************************
+    ---------------------------------------------------------------------------------------
+                        RDMA_Write BW Test
+    Dual-port       : OFF          Device         : ibP257p0s0
+    Number of qps   : 1            Transport type : IB
+    Connection type : RC           Using SRQ      : OFF
+    PCIe relax order: ON
+    ---------------------------------------------------------------------------------------
+                        RDMA_Write BW Test
+    Dual-port       : OFF          Device         : ibP257p0s0
+    Number of qps   : 1            Transport type : IB
+    Connection type : RC           Using SRQ      : OFF
+    PCIe relax order: ON
+    ibv_wr* API     : ON
+    TX depth        : 128
+    CQ Moderation   : 100
+    Mtu             : 4096[B]
+    Link type       : IB
+    Max inline data : 0[B]
+    rdma_cm QPs     : OFF
+    Data ex. method : Ethernet
+    ---------------------------------------------------------------------------------------
+    ibv_wr* API     : ON
+    CQ Moderation   : 100
+    Mtu             : 4096[B]
+    Link type       : IB
+    Max inline data : 0[B]
+    rdma_cm QPs     : OFF
+    Data ex. method : Ethernet
+    ---------------------------------------------------------------------------------------
+    local address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
+    local address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
+    remote address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
+    remote address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
+    ---------------------------------------------------------------------------------------
+    ---------------------------------------------------------------------------------------
+    #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+    #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+    2          2000             5.32               5.30               2.778732
+    4          2000             10.65              10.64              2.788833
+    8          2000             21.30              21.27              2.787609
+    16         2000             42.60              42.55              2.788268
+    32         2000             84.90              82.82              2.713896
+    64         2000             173.55             171.66             2.812504
+    128        2000             362.27             353.83             2.898535
+    256        2000             687.82             679.37             2.782698
+    512        2000             1337.12            1311.59            2.686135
+    1024       2000             2674.25            2649.39            2.712980
+    2048       2000             5248.56            5118.18            2.620509
+    4096       2000             10034.02            9948.41                   2.546793
+    8192       2000             18620.51            12782.56                  1.636168
+    16384      2000             23115.27            16782.50                  1.074080
+    32768      2000             22927.94            18586.03                  0.594753
+    65536      2000             23330.56            21167.79                  0.338685
+    131072     2000             22750.35            21443.14                  0.171545
+    262144     2000             22673.63            22411.35                  0.089645
+    524288     2000             22679.02            22678.86                  0.045358
+    1048576    2000             22817.06            22816.86                  0.022817
+    2097152    2000             22919.37            22919.27                  0.011460
+    4194304    2000             23277.93            23277.91                  0.005819
+    8388608    2000             23240.68            23240.68                  0.002905
+    ---------------------------------------------------------------------------------------
+    8388608    2000             23240.68            23240.68                  0.002905
+    ---------------------------------------------------------------------------------------
+    """
+            assert (benchmark._process_raw_result(0, raw_output_AF))
+
+            self.__binary_file.unlink()
+
+        # Condition with RDMA devices
+        else:
+            # Test for registry, preprocess and run.
+            parameters = '--ib_index 0 --numa 0 --n 2000'
+            context = BenchmarkRegistry.create_benchmark_context('rdma-loopback', parameters=parameters)
+
+            assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+            benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+            # Check results and metrics.
+            assert (benchmark.run_count == 1)
+            assert (benchmark.return_code == ReturnCode.SUCCESS)
+            assert ('raw_output_0_IB0' in benchmark.raw_data)
+            assert (len(benchmark.raw_data['raw_output_0_IB0']) == 1)
+            assert (isinstance(benchmark.raw_data['raw_output_0_IB0'][0], str))
+
+        # Test for process_raw_data.
+        # Positive case - valid raw output.
+        metric_list = []
+        message_sizes = ['8388608', '4194304', '2097152', '1048576']
+        for ib_command in benchmark._args.commands:
+            for size in message_sizes:
+                metric = 'RDMA_{}_{}_{}_{}_{}_avg'.format(
+                    str(benchmark._args.ib_index), benchmark._args.mode, size, str(benchmark._args.n), ib_command
+                )
+                metric_list.append(metric)
+        for metric in metric_list:
+            assert (metric in benchmark.result)
+            assert (len(benchmark.result[metric]) == 1)
+            assert (isinstance(benchmark.result[metric][0], numbers.Number))
+
+        # Negative case - Add invalid raw output.
+        assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
+
+        # Check basic information.
+        assert (benchmark.name == 'rdma-loopback')
+        assert (benchmark.type == BenchmarkType.MICRO)
+        assert (benchmark._bin_name == 'run_perftest_loopback')
+
+        # Check parameters specified in BenchmarkContext.
+        assert (benchmark._args.ib_index == 0)
+        assert (benchmark._args.numa == 0)
+        assert (benchmark._args.n == 2000)
+        assert (benchmark._args.size == 8388608)
+        assert (benchmark._args.commands == ['ib_write_bw'])
+        assert (benchmark._args.mode == 'AF')

From cafcc3fffc62bd07a8cfb828a0876b733e6d58c1 Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Tue, 29 Jun 2021 19:40:27 +0800
Subject: [PATCH 03/10] revise get_ib_devices to handle multi-port devices

---
 superbench/common/utils/network.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/superbench/common/utils/network.py b/superbench/common/utils/network.py
index 0ea767491..f0b03695e 100644
--- a/superbench/common/utils/network.py
+++ b/superbench/common/utils/network.py
@@ -22,7 +22,6 @@ def get_free_port():
 
 def get_ib_devices():
     """Get available ordered IB devices in the system and filter ethernet devices."""
-    # command = 'ls -l /sys/class/infiniband/* | awk \'{print $9}\' | sort | awk -F\'/\' \'{print $5}\''
     command = "ibv_devinfo | awk '$1 ~ /hca_id/||/link_layer:/ {print $1,$2}' |  awk '{print $2}'"
     output = subprocess.run(
         command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
@@ -30,6 +29,6 @@ def get_ib_devices():
     lines = output.stdout.splitlines()
     ib_devices = []
     for i in range(len(lines) - 1):
-        if 'InfiniBand' in lines[i + 1]:
+        if 'InfiniBand' in lines[i + 1] and 'InfiniBand' not in lines[i]:
             ib_devices.append(lines[i])
     return ib_devices

From befeb3ebc4d01fe80ed28801e6db761f4b16c881 Mon Sep 17 00:00:00 2001
From: root
 <root@sb-validation-scus-000001.vwsxkqe1df2evmz1vbpvfbm2zg.jx.internal.cloudapp.net>
Date: Wed, 30 Jun 2021 09:06:47 +0000
Subject: [PATCH 04/10] add runner support for running benchmarks in parallel

---
 .../benchmarks/rdma_loopback_performance.py   |   7 +-
 .../rdma_loopback_performance.py              |  31 +-
 superbench/config/default.yaml                |  11 +
 .../test_rdma_loopback_performance.py         | 317 ++++++++++--------
 4 files changed, 221 insertions(+), 145 deletions(-)

diff --git a/examples/benchmarks/rdma_loopback_performance.py b/examples/benchmarks/rdma_loopback_performance.py
index e001b00b3..014089a8f 100644
--- a/examples/benchmarks/rdma_loopback_performance.py
+++ b/examples/benchmarks/rdma_loopback_performance.py
@@ -1,18 +1,17 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-"""Model benchmark example for RDMA loopback performance.
+"""Micro benchmark example for RDMA loopback performance.
 
 Commands to run:
-  python3 examples/benchmarks/rdma_loopback_performance_performance.py
+  python examples/benchmarks/rdma_loopback_performance.py
 """
 
 from superbench.benchmarks import BenchmarkRegistry
 from superbench.common.utils import logger
 
 if __name__ == '__main__':
-    parameters = '--ib_index 0 --numa 1'
-    context = BenchmarkRegistry.create_benchmark_context('rdma-loopback', parameters=parameters)
+    context = BenchmarkRegistry.create_benchmark_context('rdma-loopback')
 
     benchmark = BenchmarkRegistry.launch_benchmark(context)
     if benchmark:
diff --git a/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
index 106546a31..a3d198c1a 100644
--- a/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
@@ -35,7 +35,7 @@ def add_parser_arguments(self):
             '--ib_index',
             type=int,
             default=0,
-            required=True,
+            required=False,
             help='The index of ib device.',
         )
         self._parser.add_argument(
@@ -69,7 +69,7 @@ def add_parser_arguments(self):
             '--numa',
             type=int,
             default=0,
-            required=True,
+            required=False,
             help='The index of numa node.',
         )
 
@@ -88,6 +88,19 @@ def __get_numa_cores(self, numa_index):
         )
         return output.stdout.splitlines()[0].split(' ')
 
+    def __get_arguments_from_env(self):
+        """Read environment variables from runner used for parallel and fill in ib_index and numa_node_index.
+
+        Read the 'PROC_RANK' (rank of the current process), 'IB_DEVICES' and 'NUMA_NODES' environment variables,
+        then set ib_index to 'IB_DEVICES'['PROC_RANK'] and numa to 'NUMA_NODES'['PROC_RANK'] (comma-separated lists).
+        """
+        if os.getenv('PROC_RANK'):
+            rank = int(os.getenv('PROC_RANK'))
+            if os.getenv('IB_DEVICES'):
+                self._args.ib_index = int(os.getenv('IB_DEVICES').split(',')[rank])
+            if os.getenv('NUMA_NODES'):
+                self._args.numa = int(os.getenv('NUMA_NODES').split(',')[rank])
+
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -97,12 +110,15 @@ def _preprocess(self):
         if not super()._preprocess():
             return False
 
+        self.__get_arguments_from_env()
+
+        # Format the arguments
         if not isinstance(self._args.commands, list):
             self._args.commands = [self._args.commands]
         self._args.commands = [command.lower() for command in self._args.commands]
-
         self._args.mode = self._args.mode.upper()
 
+        # Check whether arguments are valid
         if str(self._args.size) not in self.__message_sizes:
             self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
             logger.error(
@@ -111,12 +127,11 @@ def _preprocess(self):
                 )
             )
             return False
-
         command_mode = ''
         if self._args.mode == 'AF':
             command_mode = ' -a'
         elif self._args.mode == 'S':
-            command_mode = ' -s ' + self._args.size
+            command_mode = ' -s ' + str(self._args.size)
         else:
             self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
             logger.error(
@@ -148,9 +163,9 @@ def _preprocess(self):
                     command += ' -d ' + network.get_ib_devices()[self._args.ib_index]
                     command += ' -p ' + str(network.get_free_port())
                     self._commands.append(command)
-                except BaseException:
+                except BaseException as e:
                     self._result.set_return_code(ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
-                    logger.error('Getting devices failure - benchmark: {}.'.format(self._name))
+                    logger.error('Getting devices failure - benchmark: {}, message: {}.'.format(self._name, str(e)))
                     return False
         return True
 
@@ -200,4 +215,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
         return True
 
 
-BenchmarkRegistry.register_benchmark('rdma-loopback', RDMALoopback, parameters='--ib_index 0 --numa 1')
+BenchmarkRegistry.register_benchmark('rdma-loopback', RDMALoopback)
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 491cad2f2..5e43dd485 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -27,6 +27,17 @@ superbench:
       model_action:
         - train
   benchmarks:
+    rdma-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
+          parallel: yes
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
+          parallel: yes
     kernel-launch:
       <<: *default_local_mode
     gemm-flops:
diff --git a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
index 4ac69c02b..dbd6aba4f 100644
--- a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
@@ -25,137 +25,188 @@ def create_fake_bin(self):
 
     def test_rdma_loopback_performance(self):
         """Test rdma-loopback benchmark."""
-        # Condition without RDMA devices
-        if (len(network.get_ib_devices()) < 1):
-            # Test for registry.
-            benchmark_name = 'rdma-loopback'
-            (benchmark_class,
-             predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
-            assert (benchmark_class)
-
-            # Test for preprocess
-            parameters = '--ib_index 0 --numa 0 --n 2000'
-            benchmark = benchmark_class(benchmark_name, parameters=parameters)
-            self.create_fake_bin()
-            ret = benchmark._preprocess()
-            assert (ret is False)
-            assert (benchmark.return_code is ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
-
-            raw_output_AF = """
-    ************************************
-    * Waiting for client to connect... *
-    ************************************
-    ---------------------------------------------------------------------------------------
-                        RDMA_Write BW Test
-    Dual-port       : OFF          Device         : ibP257p0s0
-    Number of qps   : 1            Transport type : IB
-    Connection type : RC           Using SRQ      : OFF
-    PCIe relax order: ON
-    ---------------------------------------------------------------------------------------
-                        RDMA_Write BW Test
-    Dual-port       : OFF          Device         : ibP257p0s0
-    Number of qps   : 1            Transport type : IB
-    Connection type : RC           Using SRQ      : OFF
-    PCIe relax order: ON
-    ibv_wr* API     : ON
-    TX depth        : 128
-    CQ Moderation   : 100
-    Mtu             : 4096[B]
-    Link type       : IB
-    Max inline data : 0[B]
-    rdma_cm QPs     : OFF
-    Data ex. method : Ethernet
-    ---------------------------------------------------------------------------------------
-    ibv_wr* API     : ON
-    CQ Moderation   : 100
-    Mtu             : 4096[B]
-    Link type       : IB
-    Max inline data : 0[B]
-    rdma_cm QPs     : OFF
-    Data ex. method : Ethernet
-    ---------------------------------------------------------------------------------------
-    local address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
-    local address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
-    remote address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
-    remote address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
-    ---------------------------------------------------------------------------------------
-    ---------------------------------------------------------------------------------------
-    #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
-    #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
-    2          2000             5.32               5.30               2.778732
-    4          2000             10.65              10.64              2.788833
-    8          2000             21.30              21.27              2.787609
-    16         2000             42.60              42.55              2.788268
-    32         2000             84.90              82.82              2.713896
-    64         2000             173.55             171.66             2.812504
-    128        2000             362.27             353.83             2.898535
-    256        2000             687.82             679.37             2.782698
-    512        2000             1337.12            1311.59            2.686135
-    1024       2000             2674.25            2649.39            2.712980
-    2048       2000             5248.56            5118.18            2.620509
-    4096       2000             10034.02            9948.41                   2.546793
-    8192       2000             18620.51            12782.56                  1.636168
-    16384      2000             23115.27            16782.50                  1.074080
-    32768      2000             22927.94            18586.03                  0.594753
-    65536      2000             23330.56            21167.79                  0.338685
-    131072     2000             22750.35            21443.14                  0.171545
-    262144     2000             22673.63            22411.35                  0.089645
-    524288     2000             22679.02            22678.86                  0.045358
-    1048576    2000             22817.06            22816.86                  0.022817
-    2097152    2000             22919.37            22919.27                  0.011460
-    4194304    2000             23277.93            23277.91                  0.005819
-    8388608    2000             23240.68            23240.68                  0.002905
-    ---------------------------------------------------------------------------------------
-    8388608    2000             23240.68            23240.68                  0.002905
-    ---------------------------------------------------------------------------------------
+        raw_output = {}
+        raw_output['AF'] = """
+************************************
+* Waiting for client to connect... *
+************************************
+---------------------------------------------------------------------------------------
+                    RDMA_Write BW Test
+Dual-port       : OFF          Device         : ibP257p0s0
+Number of qps   : 1            Transport type : IB
+Connection type : RC           Using SRQ      : OFF
+PCIe relax order: ON
+---------------------------------------------------------------------------------------
+                    RDMA_Write BW Test
+Dual-port       : OFF          Device         : ibP257p0s0
+Number of qps   : 1            Transport type : IB
+Connection type : RC           Using SRQ      : OFF
+PCIe relax order: ON
+ibv_wr* API     : ON
+TX depth        : 128
+CQ Moderation   : 100
+Mtu             : 4096[B]
+Link type       : IB
+Max inline data : 0[B]
+rdma_cm QPs     : OFF
+Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+ibv_wr* API     : ON
+CQ Moderation   : 100
+Mtu             : 4096[B]
+Link type       : IB
+Max inline data : 0[B]
+rdma_cm QPs     : OFF
+Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+local address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
+local address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
+remote address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
+remote address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
+---------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------
+#bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+#bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+2          2000             5.32               5.30               2.778732
+4          2000             10.65              10.64              2.788833
+8          2000             21.30              21.27              2.787609
+16         2000             42.60              42.55              2.788268
+32         2000             84.90              82.82              2.713896
+64         2000             173.55             171.66             2.812504
+128        2000             362.27             353.83             2.898535
+256        2000             687.82             679.37             2.782698
+512        2000             1337.12            1311.59            2.686135
+1024       2000             2674.25            2649.39            2.712980
+2048       2000             5248.56            5118.18            2.620509
+4096       2000             10034.02            9948.41                   2.546793
+8192       2000             18620.51            12782.56                  1.636168
+16384      2000             23115.27            16782.50                  1.074080
+32768      2000             22927.94            18586.03                  0.594753
+65536      2000             23330.56            21167.79                  0.338685
+131072     2000             22750.35            21443.14                  0.171545
+262144     2000             22673.63            22411.35                  0.089645
+524288     2000             22679.02            22678.86                  0.045358
+1048576    2000             22817.06            22816.86                  0.022817
+2097152    2000             22919.37            22919.27                  0.011460
+4194304    2000             23277.93            23277.91                  0.005819
+8388608    2000             23240.68            23240.68                  0.002905
+---------------------------------------------------------------------------------------
+8388608    2000             23240.68            23240.68                  0.002905
+---------------------------------------------------------------------------------------
     """
-            assert (benchmark._process_raw_result(0, raw_output_AF))
-
-            self.__binary_file.unlink()
-
-        # Condition with RDMA devices
-        else:
-            # Test for registry, preprocess and run.
-            parameters = '--ib_index 0 --numa 0 --n 2000'
-            context = BenchmarkRegistry.create_benchmark_context('rdma-loopback', parameters=parameters)
-
-            assert (BenchmarkRegistry.is_benchmark_context_valid(context))
-            benchmark = BenchmarkRegistry.launch_benchmark(context)
-
-            # Check results and metrics.
-            assert (benchmark.run_count == 1)
-            assert (benchmark.return_code == ReturnCode.SUCCESS)
-            assert ('raw_output_0_IB0' in benchmark.raw_data)
-            assert (len(benchmark.raw_data['raw_output_0_IB0']) == 1)
-            assert (isinstance(benchmark.raw_data['raw_output_0_IB0'][0], str))
-
-        # Test for process_raw_data.
-        # Positive case - valid raw output.
-        metric_list = []
-        message_sizes = ['8388608', '4194304', '2097152', '1048576']
-        for ib_command in benchmark._args.commands:
-            for size in message_sizes:
-                metric = 'RDMA_{}_{}_{}_{}_{}_avg'.format(
-                    str(benchmark._args.ib_index), benchmark._args.mode, size, str(benchmark._args.n), ib_command
-                )
-                metric_list.append(metric)
-        for metric in metric_list:
-            assert (metric in benchmark.result)
-            assert (len(benchmark.result[metric]) == 1)
-            assert (isinstance(benchmark.result[metric][0], numbers.Number))
-
-        # Negative case - Add invalid raw output.
-        assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
-
-        # Check basic information.
-        assert (benchmark.name == 'rdma-loopback')
-        assert (benchmark.type == BenchmarkType.MICRO)
-        assert (benchmark._bin_name == 'run_perftest_loopback')
-
-        # Check parameters specified in BenchmarkContext.
-        assert (benchmark._args.ib_index == 0)
-        assert (benchmark._args.numa == 0)
-        assert (benchmark._args.n == 2000)
-        assert (benchmark._args.size == 8388608)
-        assert (benchmark._args.commands == ['ib_write_bw'])
-        assert (benchmark._args.mode == 'AF')
+        raw_output['S'] = """
+                        RDMA_Write BW Test
+ Dual-port       : OFF		Device         : ibP257p0s0
+ Number of qps   : 1		Transport type : IB
+ Connection type : RC		Using SRQ      : OFF
+ PCIe relax order: ON
+ TX depth        : 128
+ CQ Moderation   : 1
+ Mtu             : 4096[B]
+ Link type       : IB
+ Max inline data : 0[B]
+ rdma_cm QPs	 : OFF
+ Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0xd06 QPN 0x095f PSN 0x3c9e82 RKey 0x080359 VAddr 0x007f9fc479c000
+ remote address: LID 0xd06 QPN 0x095e PSN 0xbd024b RKey 0x080258 VAddr 0x007fe62504b000
+---------------------------------------------------------------------------------------
+ #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+ 8388608    20000            24056.74            24056.72		   0.003007
+************************************
+* Waiting for client to connect... *
+************************************
+---------------------------------------------------------------------------------------
+                    RDMA_Write BW Test
+ Dual-port       : OFF		Device         : ibP257p0s0
+ Number of qps   : 1		Transport type : IB
+ Connection type : RC		Using SRQ      : OFF
+ PCIe relax order: ON
+ CQ Moderation   : 1
+ Mtu             : 4096[B]
+ Link type       : IB
+ Max inline data : 0[B]
+ rdma_cm QPs	 : OFF
+ Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0xd06 QPN 0x095e PSN 0xbd024b RKey 0x080258 VAddr 0x007fe62504b000
+ remote address: LID 0xd06 QPN 0x095f PSN 0x3c9e82 RKey 0x080359 VAddr 0x007f9fc479c000
+---------------------------------------------------------------------------------------
+ #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+ 8388608    20000            24056.74            24056.72		   0.003007
+---------------------------------------------------------------------------------------
+
+---------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------
+"""
+        for mode in ['AF', 'S']:
+            # Test without RDMA devices
+            if (len(network.get_ib_devices()) < 1):
+                self.create_fake_bin()
+                # Check registry.
+                benchmark_name = 'rdma-loopback'
+                (benchmark_class, predefine_params
+                 ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+                assert (benchmark_class)
+
+                # Check preprocess
+                parameters = '--ib_index 0 --numa 0 --n 2000 --mode ' + mode
+                benchmark = benchmark_class(benchmark_name, parameters=parameters)
+                ret = benchmark._preprocess()
+                assert (ret is False)
+                assert (benchmark.return_code is ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
+
+                assert (benchmark._process_raw_result(0, raw_output[mode]))
+
+                self.__binary_file.unlink()
+
+            # Test with RDMA devices
+            else:
+                # Check registry, preprocess and run.
+                parameters = '--ib_index 0 --numa 0 --n 2000 --mode ' + mode
+                context = BenchmarkRegistry.create_benchmark_context('rdma-loopback', parameters=parameters)
+
+                assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+                benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+                # Check raw_data.
+                assert (benchmark.run_count == 1)
+                assert (benchmark.return_code == ReturnCode.SUCCESS)
+                assert ('raw_output_0_IB0' in benchmark.raw_data)
+                assert (len(benchmark.raw_data['raw_output_0_IB0']) == 1)
+                assert (isinstance(benchmark.raw_data['raw_output_0_IB0'][0], str))
+
+            # Check function process_raw_data.
+            # Positive case - valid raw output.
+            metric_list = []
+            message_sizes = []
+            if mode == 'AF':
+                message_sizes = ['8388608', '4194304', '2097152', '1048576']
+            elif mode == 'S':
+                message_sizes = [benchmark._args.size]
+            for ib_command in benchmark._args.commands:
+                for size in message_sizes:
+                    metric = 'RDMA_{}_{}_{}_{}_{}_avg'.format(
+                        str(benchmark._args.ib_index), benchmark._args.mode, size, str(benchmark._args.n), ib_command
+                    )
+                    metric_list.append(metric)
+            for metric in metric_list:
+                assert (metric in benchmark.result)
+                assert (len(benchmark.result[metric]) == 1)
+                assert (isinstance(benchmark.result[metric][0], numbers.Number))
+
+            # Negative case - Add invalid raw output.
+            assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
+
+            # Check basic information.
+            assert (benchmark.name == 'rdma-loopback')
+            assert (benchmark.type == BenchmarkType.MICRO)
+            assert (benchmark._bin_name == 'run_perftest_loopback')
+
+            # Check parameters specified in BenchmarkContext.
+            assert (benchmark._args.ib_index == 0)
+            assert (benchmark._args.numa == 0)
+            assert (benchmark._args.n == 2000)
+            assert (benchmark._args.size == 8388608)
+            assert (benchmark._args.commands == ['ib_write_bw'])
+            assert (benchmark._args.mode == mode)

From d1480d92a09e4176db1eb344f446306732ec03f5 Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Thu, 1 Jul 2021 09:08:59 +0800
Subject: [PATCH 05/10] revise test a little bit

---
 .../test_rdma_loopback_performance.py         | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
index dbd6aba4f..8314192af 100644
--- a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
@@ -14,14 +14,20 @@
 
 class RDMALoopbackTest(unittest.TestCase):
     """Tests for RDMALoopback benchmark."""
-    def create_fake_bin(self):
+    def setUp(self):
         """Method called to prepare the test fixture."""
-        # Create fake binary file just for testing.
-        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
-        binary_path = os.path.join(os.getenv('SB_MICRO_PATH'), 'bin')
-        Path(binary_path).mkdir(parents=True, exist_ok=True)
-        self.__binary_file = Path(os.path.join(binary_path, 'run_perftest_loopback'))
-        self.__binary_file.touch(mode=0o755, exist_ok=True)
+        if (len(network.get_ib_devices()) < 1):
+            # Create fake binary file just for testing.
+            os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
+            binary_path = os.path.join(os.getenv('SB_MICRO_PATH'), 'bin')
+            Path(binary_path).mkdir(parents=True, exist_ok=True)
+            self.__binary_file = Path(os.path.join(binary_path, 'run_perftest_loopback'))
+            self.__binary_file.touch(mode=0o755, exist_ok=True)
+
+    def tearDown(self):
+        """Method called after the test method has been called and the result recorded."""
+        if (len(network.get_ib_devices()) < 1):
+            self.__binary_file.unlink()
 
     def test_rdma_loopback_performance(self):
         """Test rdma-loopback benchmark."""
@@ -142,7 +148,6 @@ def test_rdma_loopback_performance(self):
         for mode in ['AF', 'S']:
             # Test without RDMA devices
             if (len(network.get_ib_devices()) < 1):
-                self.create_fake_bin()
                 # Check registry.
                 benchmark_name = 'rdma-loopback'
                 (benchmark_class, predefine_params
@@ -158,8 +163,6 @@ def test_rdma_loopback_performance(self):
 
                 assert (benchmark._process_raw_result(0, raw_output[mode]))
 
-                self.__binary_file.unlink()
-
             # Test with RDMA devices
             else:
                 # Check registry, preprocess and run.

From 18b935181d8733495e29a43d2fd0ca858fd645f2 Mon Sep 17 00:00:00 2001
From: root
 <root@sb-validation-scus-000001.vwsxkqe1df2evmz1vbpvfbm2zg.jx.internal.cloudapp.net>
Date: Wed, 7 Jul 2021 07:37:17 +0000
Subject: [PATCH 06/10] add git submodule for perftest

---
 .gitmodules          |  3 +++
 third_party/Makefile | 12 +++++++++---
 third_party/perftest |  1 +
 3 files changed, 13 insertions(+), 3 deletions(-)
 create mode 160000 third_party/perftest

diff --git a/.gitmodules b/.gitmodules
index 07f7559ce..4f3732b45 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,3 +2,6 @@
 	path = third_party/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
 	branch = v2.4.0
+[submodule "third_party/perftest"]
+	path = third_party/perftest
+	url = https://github.com/linux-rdma/perftest.git
diff --git a/third_party/Makefile b/third_party/Makefile
index 64f3e6d8d..9839e0788 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -4,15 +4,21 @@
 
 SB_MICRO_PATH ?= "/usr/local"
 
-.PHONY: all cutlass
+.PHONY: all cutlass perftest
 
 # Build all targets.
-all: cutlass
+all: cutlass perftest
 
 # Build cutlass.
-cutlass:
+cutlass: 
 ifneq (,$(wildcard cutlass/CMakeLists.txt))
 	cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib -DCMAKE_BUILD_TYPE=Release \
 		-DCUTLASS_NVCC_ARCHS='70;80' -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
 	cmake --build ./cutlass/build -j 8 --target install
 endif
+# Build perftest.
+perftest: 
+ifneq (,$(wildcard perftest/autogen.sh))
+	cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j && make install
+endif
+
diff --git a/third_party/perftest b/third_party/perftest
new file mode 160000
index 000000000..7504ce48a
--- /dev/null
+++ b/third_party/perftest
@@ -0,0 +1 @@
+Subproject commit 7504ce48ac396a02f4d00de359257b2cb8458f06

From d688ed80cc0935a470f284da47a6776c72b97e2c Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Wed, 7 Jul 2021 09:34:54 +0000
Subject: [PATCH 07/10] remove util-related code

---
 superbench/common/utils/__init__.py           |  2 +-
 superbench/common/utils/network.py            | 34 -------------------
 .../test_computation_communication_overlap.py |  3 +-
 .../micro_benchmarks/test_sharding_matmul.py  |  3 +-
 tests/benchmarks/utils.py                     | 17 ++++++++--
 5 files changed, 18 insertions(+), 41 deletions(-)
 delete mode 100644 superbench/common/utils/network.py

diff --git a/superbench/common/utils/__init__.py b/superbench/common/utils/__init__.py
index a339d41fd..708eed456 100644
--- a/superbench/common/utils/__init__.py
+++ b/superbench/common/utils/__init__.py
@@ -9,4 +9,4 @@
 
 nv_helper = LazyImport('superbench.common.utils.nvidia_helper')
 
-__all__ = ['SuperBenchLogger', 'logger', 'create_output_dir', 'get_sb_config', 'LazyImport', 'nv_helper', 'network']
+__all__ = ['SuperBenchLogger', 'logger', 'create_output_dir', 'get_sb_config', 'LazyImport', 'nv_helper']
diff --git a/superbench/common/utils/network.py b/superbench/common/utils/network.py
deleted file mode 100644
index f0b03695e..000000000
--- a/superbench/common/utils/network.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-"""Network Utility."""
-
-import socket
-import subprocess
-from contextlib import closing
-
-
-def get_free_port():
-    """Get a free port in current system.
-
-    Return:
-        port (int): a free port in current system.
-    """
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-        s.bind(('', 0))
-        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        return s.getsockname()[1]
-
-
-def get_ib_devices():
-    """Get available ordered IB devices in the system and filter ethernet devices."""
-    command = "ibv_devinfo | awk '$1 ~ /hca_id/||/link_layer:/ {print $1,$2}' |  awk '{print $2}'"
-    output = subprocess.run(
-        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
-    )
-    lines = output.stdout.splitlines()
-    ib_devices = []
-    for i in range(len(lines) - 1):
-        if 'InfiniBand' in lines[i + 1] and 'InfiniBand' not in lines[i]:
-            ib_devices.append(lines[i])
-    return ib_devices
diff --git a/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py b/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py
index 598bdab09..381dec3ac 100644
--- a/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py
+++ b/tests/benchmarks/micro_benchmarks/test_computation_communication_overlap.py
@@ -7,7 +7,6 @@
 
 from tests.helper import decorator
 import tests.benchmarks.utils as utils
-from superbench.common.utils import network
 from superbench.benchmarks import BenchmarkRegistry, Framework, BenchmarkType, ReturnCode
 from superbench.benchmarks.micro_benchmarks.computation_communication_overlap \
     import ComputationCommunicationOverlap, ComputationKernelType
@@ -57,7 +56,7 @@ def test_pytorch_computation_communication_overlap_fake_distributed():
         parameters='--num_warmup 5 --num_steps 10 --ratio 5',
         framework=Framework.PYTORCH
     )
-    utils.setup_simulated_ddp_distributed_env(1, 0, network.get_free_port())
+    utils.setup_simulated_ddp_distributed_env(1, 0, utils.get_free_port())
     benchmark = BenchmarkRegistry.launch_benchmark(context)
 
     # Check basic information.
diff --git a/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py b/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
index 03428d3b1..c828d7e18 100644
--- a/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
+++ b/tests/benchmarks/micro_benchmarks/test_sharding_matmul.py
@@ -5,7 +5,6 @@
 
 import tests.benchmarks.utils as utils
 from tests.helper import decorator
-from superbench.common.utils import network
 from superbench.benchmarks import BenchmarkRegistry, Platform, Framework, BenchmarkType, ReturnCode
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul, ShardingMode
 
@@ -23,7 +22,7 @@ def test_pytorch_sharding_matmul():
 
     assert (BenchmarkRegistry.is_benchmark_context_valid(context))
 
-    utils.setup_simulated_ddp_distributed_env(1, 0, network.get_free_port())
+    utils.setup_simulated_ddp_distributed_env(1, 0, utils.get_free_port())
     benchmark = BenchmarkRegistry.launch_benchmark(context)
 
     # Check basic information.
diff --git a/tests/benchmarks/utils.py b/tests/benchmarks/utils.py
index 87c46b1d8..3ef897c5e 100644
--- a/tests/benchmarks/utils.py
+++ b/tests/benchmarks/utils.py
@@ -4,11 +4,12 @@
 """Utilities for benchmark tests."""
 
 import os
+import socket
+from contextlib import closing
 import multiprocessing as multiprocessing
 from multiprocessing import Process
 
 from superbench.benchmarks import BenchmarkRegistry
-from superbench.common.utils import network
 
 
 def clean_simulated_ddp_distributed_env():
@@ -20,6 +21,18 @@ def clean_simulated_ddp_distributed_env():
     os.environ.pop('MASTER_PORT')
 
 
+def get_free_port():
+    """Get a free port in current system.
+
+    Return:
+        port (int): a free port in current system.
+    """
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        s.bind(('', 0))
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        return s.getsockname()[1]
+
+
 def setup_simulated_ddp_distributed_env(world_size, local_rank, port):
     """Function to setup the simulated DDP distributed envionment variables."""
     os.environ['WORLD_SIZE'] = str(world_size)
@@ -45,7 +58,7 @@ def simulated_ddp_distributed_benchmark(context, world_size):
     Return:
         results (list): list of benchmark results from #world_size number of processes.
     """
-    port = network.get_free_port()
+    port = get_free_port()
     process_list = []
     multiprocessing.set_start_method('spawn')
 

From ebf068a2d7e915d80020aa51339b3d57edcb45e4 Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Tue, 13 Jul 2021 16:23:36 +0800
Subject: [PATCH 08/10] rename from RDMA to IB

---
 ...formance.py => ib_loopback_performance.py} |  6 +++---
 .../benchmarks/micro_benchmarks/__init__.py   |  4 ++--
 ...formance.py => ib_loopback_performance.py} | 10 +++++-----
 superbench/config/default.yaml                |  2 +-
 ...nce.py => test_ib_loopback_performance.py} | 20 +++++++++----------
 5 files changed, 21 insertions(+), 21 deletions(-)
 rename examples/benchmarks/{rdma_loopback_performance.py => ib_loopback_performance.py} (70%)
 rename superbench/benchmarks/micro_benchmarks/{rdma_loopback_performance.py => ib_loopback_performance.py} (96%)
 rename tests/benchmarks/micro_benchmarks/{test_rdma_loopback_performance.py => test_ib_loopback_performance.py} (95%)

diff --git a/examples/benchmarks/rdma_loopback_performance.py b/examples/benchmarks/ib_loopback_performance.py
similarity index 70%
rename from examples/benchmarks/rdma_loopback_performance.py
rename to examples/benchmarks/ib_loopback_performance.py
index 014089a8f..0d3b8433b 100644
--- a/examples/benchmarks/rdma_loopback_performance.py
+++ b/examples/benchmarks/ib_loopback_performance.py
@@ -1,17 +1,17 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-"""Micro benchmark example for RDMA loopback performance.
+"""Micro benchmark example for IB loopback performance.
 
 Commands to run:
-  python examples/benchmarks/rdma_loopback_performance_performance.py
+  python examples/benchmarks/ib_loopback_performance.py
 """
 
 from superbench.benchmarks import BenchmarkRegistry
 from superbench.common.utils import logger
 
 if __name__ == '__main__':
-    context = BenchmarkRegistry.create_benchmark_context('rdma-loopback')
+    context = BenchmarkRegistry.create_benchmark_context('ib-loopback')
 
     benchmark = BenchmarkRegistry.launch_benchmark(context)
     if benchmark:
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 660eb5a9e..442d8e67d 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -10,9 +10,9 @@
 from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
 from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
 from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
-from superbench.benchmarks.micro_benchmarks.rdma_loopback_performance import RDMALoopback
+from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopback
 
 __all__ = [
     'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
-    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'RDMALoopback'
+    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'IBLoopback'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
similarity index 96%
rename from superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
rename to superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
index a3d198c1a..cc23f0582 100644
--- a/superbench/benchmarks/micro_benchmarks/rdma_loopback_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-"""Module of the RDMA loopback benchmarks."""
+"""Module of the IB loopback benchmarks."""
 
 import os
 import subprocess
@@ -12,8 +12,8 @@
 from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
 
 
-class RDMALoopback(MicroBenchmarkWithInvoke):
-    """The RDMA loopback performance benchmark class."""
+class IBLoopback(MicroBenchmarkWithInvoke):
+    """The IB loopback performance benchmark class."""
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -193,7 +193,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     if self.__message_sizes[i] in line:
                         values = list(filter(None, line.split(' ')))
                         avg_bw = float(values[-2])
-                        metric = 'RDMA_{}_{}_{}_{}_{}_avg'.format(
+                        metric = 'IB_{}_{}_{}_{}_{}_avg'.format(
                             str(self._args.ib_index), self._args.mode, self.__message_sizes[i], str(self._args.n),
                             ib_command
                         )
@@ -215,4 +215,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
         return True
 
 
-BenchmarkRegistry.register_benchmark('rdma-loopback', RDMALoopback)
+BenchmarkRegistry.register_benchmark('ib-loopback', IBLoopback)
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 5e43dd485..be6ce4777 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -27,7 +27,7 @@ superbench:
       model_action:
         - train
   benchmarks:
-    rdma-loopback:
+    ib-loopback:
       enable: true
       modes:
         - name: local
diff --git a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
similarity index 95%
rename from tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
rename to tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
index 8314192af..37882327d 100644
--- a/tests/benchmarks/micro_benchmarks/test_rdma_loopback_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-"""Tests for rdma-loopback benchmark."""
+"""Tests for ib-loopback benchmark."""
 
 import os
 import numbers
@@ -12,8 +12,8 @@
 from superbench.common.utils import network
 
 
-class RDMALoopbackTest(unittest.TestCase):
-    """Tests for RDMALoopback benchmark."""
+class IBLoopbackTest(unittest.TestCase):
+    """Tests for IBLoopback benchmark."""
     def setUp(self):
         """Method called to prepare the test fixture."""
         if (len(network.get_ib_devices()) < 1):
@@ -29,8 +29,8 @@ def tearDown(self):
         if (len(network.get_ib_devices()) < 1):
             self.__binary_file.unlink()
 
-    def test_rdma_loopback_performance(self):
-        """Test rdma-loopback benchmark."""
+    def test_ib_loopback_performance(self):
+        """Test ib-loopback benchmark."""
         raw_output = {}
         raw_output['AF'] = """
 ************************************
@@ -146,10 +146,10 @@ def test_rdma_loopback_performance(self):
 ---------------------------------------------------------------------------------------
 """
         for mode in ['AF', 'S']:
-            # Test without RDMA devices
+            # Test without IB devices
             if (len(network.get_ib_devices()) < 1):
                 # Check registry.
-                benchmark_name = 'rdma-loopback'
+                benchmark_name = 'ib-loopback'
                 (benchmark_class, predefine_params
                  ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
                 assert (benchmark_class)
@@ -163,11 +163,11 @@ def test_rdma_loopback_performance(self):
 
                 assert (benchmark._process_raw_result(0, raw_output[mode]))
 
-            # Test with RDMA devices
+            # Test with IB devices
             else:
                 # Check registry, preprocess and run.
                 parameters = '--ib_index 0 --numa 0 --n 2000 --mode ' + mode
-                context = BenchmarkRegistry.create_benchmark_context('rdma-loopback', parameters=parameters)
+                context = BenchmarkRegistry.create_benchmark_context('ib-loopback', parameters=parameters)
 
                 assert (BenchmarkRegistry.is_benchmark_context_valid(context))
                 benchmark = BenchmarkRegistry.launch_benchmark(context)
@@ -202,7 +202,7 @@ def test_rdma_loopback_performance(self):
             assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
 
             # Check basic information.
-            assert (benchmark.name == 'rdma-loopback')
+            assert (benchmark.name == 'ib-loopback')
             assert (benchmark.type == BenchmarkType.MICRO)
             assert (benchmark._bin_name == 'run_perftest_loopback')
 

From 16e7c172b3dda5941a3e737f403e6cf4a6d853ad Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Tue, 13 Jul 2021 17:50:52 +0800
Subject: [PATCH 09/10] fix test issue and rename metric

---
 .../benchmarks/micro_benchmarks/ib_loopback_performance.py   | 5 ++---
 .../micro_benchmarks/test_ib_loopback_performance.py         | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
index cc23f0582..5e0288c90 100644
--- a/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
@@ -193,9 +193,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     if self.__message_sizes[i] in line:
                         values = list(filter(None, line.split(' ')))
                         avg_bw = float(values[-2])
-                        metric = 'IB_{}_{}_{}_{}_{}_avg'.format(
-                            str(self._args.ib_index), self._args.mode, self.__message_sizes[i], str(self._args.n),
-                            ib_command
+                        metric = 'IB_Avg_{}'.format(
+                            str(self._args.ib_index)
                         )
                         if metric not in metric_set:
                             metric_set.add(metric)
diff --git a/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
index 37882327d..147422ce7 100644
--- a/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
@@ -189,8 +189,8 @@ def test_ib_loopback_performance(self):
                 message_sizes = [benchmark._args.size]
             for ib_command in benchmark._args.commands:
                 for size in message_sizes:
-                    metric = 'RDMA_{}_{}_{}_{}_{}_avg'.format(
-                        str(benchmark._args.ib_index), benchmark._args.mode, size, str(benchmark._args.n), ib_command
+                    metric = 'IB_Avg_{}'.format(
+                        str(benchmark._args.ib_index)
                     )
                     metric_list.append(metric)
             for metric in metric_list:

From 31779b76a57cbdac53df93cd983ce95f8df8f85a Mon Sep 17 00:00:00 2001
From: yukirora <v-yujiang@microsoft.com>
Date: Tue, 13 Jul 2021 17:53:09 +0800
Subject: [PATCH 10/10] format and lint

---
 .gitmodules                                                  | 3 ---
 .../benchmarks/micro_benchmarks/ib_loopback_performance.py   | 5 +----
 .../micro_benchmarks/test_ib_loopback_performance.py         | 4 +---
 third_party/Makefile                                         | 1 -
 4 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index f6d5d46c2..4f3732b45 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,9 +2,6 @@
 	path = third_party/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
 	branch = v2.4.0
-[submodule "third_party/cuda-samples"]
-	path = third_party/cuda-samples
-	url = https://github.com/NVIDIA/cuda-samples.git
 [submodule "third_party/perftest"]
 	path = third_party/perftest
 	url = https://github.com/linux-rdma/perftest.git
diff --git a/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
index 5e0288c90..39fde4748 100644
--- a/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
@@ -181,7 +181,6 @@ def _process_raw_result(self, cmd_idx, raw_output):
         Return:
             True if the raw output string is valid and result can be extracted.
         """
-        ib_command = self._args.commands[cmd_idx]
         self._result.add_raw_data('raw_output_' + str(cmd_idx) + '_IB' + str(self._args.ib_index), raw_output)
 
         valid = False
@@ -193,9 +192,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     if self.__message_sizes[i] in line:
                         values = list(filter(None, line.split(' ')))
                         avg_bw = float(values[-2])
-                        metric = 'IB_Avg_{}'.format(
-                            str(self._args.ib_index)
-                        )
+                        metric = 'IB_Avg_{}'.format(str(self._args.ib_index))
                         if metric not in metric_set:
                             metric_set.add(metric)
                             self._result.add_result(metric, avg_bw)
diff --git a/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
index 147422ce7..093f41140 100644
--- a/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
@@ -189,9 +189,7 @@ def test_ib_loopback_performance(self):
                 message_sizes = [benchmark._args.size]
             for ib_command in benchmark._args.commands:
                 for size in message_sizes:
-                    metric = 'IB_Avg_{}'.format(
-                        str(benchmark._args.ib_index)
-                    )
+                    metric = 'IB_Avg_{}'.format(str(benchmark._args.ib_index))
                     metric_list.append(metric)
             for metric in metric_list:
                 assert (metric in benchmark.result)
diff --git a/third_party/Makefile b/third_party/Makefile
index 2282b400c..0a96030bc 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -34,6 +34,5 @@ endif
 # The version we use is v4.5-0.2, which is the latest release tag of perftest
 perftest: 
 ifneq (,$(wildcard perftest/autogen.sh))
-	cd perftest && git checkout v4.5-0.2
 	cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j && make install
 endif
\ No newline at end of file