diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index d8dcf5e8b..ae8e695e8 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -36,6 +36,7 @@ RUN apt-get update && \
     util-linux \
     vim \
     wget \
+    numactl \
     && \
     apt-get autoremove && \
     apt-get clean && \
diff --git a/examples/benchmarks/ib_loopback_performance.py b/examples/benchmarks/ib_loopback_performance.py
new file mode 100644
index 000000000..0d3b8433b
--- /dev/null
+++ b/examples/benchmarks/ib_loopback_performance.py
@@ -0,0 +1,22 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for IB loopback performance.
+
+Commands to run:
+  python examples/benchmarks/ib_loopback_performance.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    # Build the context of the 'ib-loopback' benchmark, then let the registry launch it.
+    bench_context = BenchmarkRegistry.create_benchmark_context('ib-loopback')
+    launched = BenchmarkRegistry.launch_benchmark(bench_context)
+    # Report only when the registry handed back a benchmark object.
+    if launched:
+        report = 'benchmark: {}, return code: {}, result: {}'.format(
+            launched.name, launched.return_code, launched.result
+        )
+        logger.info(report)
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 5f3b76ac8..a257cba36 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -11,8 +11,9 @@
 from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
 from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
 from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
+from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopback
 
 __all__ = [
     'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
-    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'CudaMemBwBenchmark'
+    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'CudaMemBwBenchmark', 'IBLoopback'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
new file mode 100644
index 000000000..39fde4748
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/ib_loopback_performance.py
@@ -0,0 +1,214 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the IB loopback benchmarks."""
+
+import os
+import subprocess
+
+from superbench.common.utils import logger
+from superbench.common.utils import network
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class IBLoopback(MicroBenchmarkWithInvoke):
+    """The IB loopback performance benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'run_perftest_loopback'
+        self.__support_ib_commands = ['ib_write_bw', 'ib_read_bw', 'ib_send_bw']    # accepted by --commands
+        self.__message_sizes = ['8388608', '4194304', '2097152', '1048576']    # accepted by --size, parsed from output
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--ib_index',
+            type=int,
+            default=0,
+            required=False,
+            help='The index of ib device.',
+        )
+        self._parser.add_argument(
+            '--n',
+            type=int,
+            default=20000,
+            required=False,
+            help='The iterations of running ib command',
+        )
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=8388608,
+            required=False,
+            help='The message size of running ib command. E.g. {}.'.format(' '.join(self.__message_sizes)),
+        )
+        self._parser.add_argument(
+            '--commands',
+            type=str,
+            nargs='+',
+            default='ib_write_bw',
+            help='The ib command used to run. E.g. {}.'.format(' '.join(self.__support_ib_commands)),
+        )
+        self._parser.add_argument(
+            '--mode',
+            type=str,
+            default='AF',
+            help='The mode used to run ib command. Eg, AF(all message size) or S(single message size)',
+        )
+        self._parser.add_argument(
+            '--numa',
+            type=int,
+            default=0,
+            required=False,
+            help='The index of numa node.',
+        )
+
+    def __get_numa_cores(self, numa_index):
+        """Get the cpu list of NUMA<numa_index> as printed by `numactl --hardware` (IndexError if no output).
+
+        Args:
+            numa_index (int): the index of numa node.
+
+        Return:
+            list of str: tokens of the first output line, i.e. 'node', '<numa_index>', 'cpus:', then the cpu ids.
+        """
+        command = 'numactl --hardware | grep "node {} cpus:"'.format(numa_index)
+        output = subprocess.run(
+            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+        )
+        return output.stdout.splitlines()[0].split()    # split() drops empty tokens from extra/trailing blanks
+
+    def __get_arguments_from_env(self):
+        """Read environment variables from runner used for parallel and fill in ib_index and numa_node_index.
+
+        Get 'PROC_RANK'(rank of current process) 'IB_DEVICES' 'NUMA_NODES' environment variables
+        Get ib_index and numa_node_index according to 'NUMA_NODES'['PROC_RANK'] and 'IB_DEVICES'['PROC_RANK']
+        """
+        if os.getenv('PROC_RANK'):
+            rank = int(os.getenv('PROC_RANK'))
+            if os.getenv('IB_DEVICES'):
+                self._args.ib_index = int(os.getenv('IB_DEVICES').split(',')[rank])
+            if os.getenv('NUMA_NODES'):
+                self._args.numa = int(os.getenv('NUMA_NODES').split(',')[rank])
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        self.__get_arguments_from_env()
+
+        # Format the arguments
+        if not isinstance(self._args.commands, list):
+            self._args.commands = [self._args.commands]
+        self._args.commands = [command.lower() for command in self._args.commands]
+        self._args.mode = self._args.mode.upper()
+
+        # Check whether arguments are valid
+        if str(self._args.size) not in self.__message_sizes:
+            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+            logger.error(
+                'Unsupported message size - benchmark: {}, size: {}, expect: {}.'.format(
+                    self._name, self._args.size, self.__message_sizes
+                )
+            )
+            return False
+        command_mode = ''
+        if self._args.mode == 'AF':
+            command_mode = ' -a'
+        elif self._args.mode == 'S':
+            command_mode = ' -s ' + str(self._args.size)
+        else:
+            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+            logger.error(
+                'Unsupported args mode - benchmark: {}, mode: {}, expect: {}.'.format(
+                    self._name, self._args.mode, 'AF or S'
+                )
+            )
+            return False
+
+        for ib_command in self._args.commands:
+            if ib_command not in self.__support_ib_commands:
+                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+                logger.error(
+                    'Unsupported ib command - benchmark: {}, command: {}, expected: {}.'.format(
+                        self._name, ib_command, self.__support_ib_commands
+                    )
+                )
+                return False
+            else:
+                try:
+                    command = os.path.join(self._args.bin_dir, self._bin_name)
+                    numa_cores = self.__get_numa_cores(self._args.numa)
+                    server_core = int(numa_cores[-1])    # last cpu id listed for the node
+                    client_core = int(numa_cores[-3])    # skips one id, presumably an HT sibling - TODO confirm
+                    command += ' ' + str(server_core) + ' ' + str(client_core)
+                    command += ' ' + ib_command
+                    command += command_mode + ' -F'
+                    command += ' --iters=' + str(self._args.n)
+                    command += ' -d ' + network.get_ib_devices()[self._args.ib_index]
+                    command += ' -p ' + str(network.get_free_port())
+                    self._commands.append(command)
+                except Exception as e:
+                    self._result.set_return_code(ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
+                    logger.error('Getting devices failure - benchmark: {}, message: {}.'.format(self._name, str(e)))
+                    return False
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data('raw_output_' + str(cmd_idx) + '_IB' + str(self._args.ib_index), raw_output)
+
+        valid = False
+        metric = 'IB_Avg_{}'.format(str(self._args.ib_index))
+        try:
+            for line in raw_output.splitlines():
+                values = line.split()
+                # A result row is '<#bytes> <#iterations> <BW peak> <BW average> <MsgRate>'. Compare the
+                # whole #bytes field: a substring test would also hit rows whose iteration count or
+                # bandwidth merely contains one of the tracked sizes (e.g. --n 1048576).
+                if values and values[0] in self.__message_sizes:
+                    # The metric name carries no size, so only the first tracked row is recorded, then stop.
+                    self._result.add_result(metric, float(values[-2]))
+                    valid = True
+                    break
+        except Exception:
+            # E.g. ValueError/IndexError from a malformed row; reported as an invalid result below.
+            valid = False
+
+        if not valid:
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                    self._curr_run_index, self._name, raw_output
+                )
+            )
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('ib-loopback', IBLoopback)    # 'ib-loopback' is the name used to select it
diff --git a/superbench/benchmarks/return_code.py b/superbench/benchmarks/return_code.py
index 0991ddb22..da207d01a 100644
--- a/superbench/benchmarks/return_code.py
+++ b/superbench/benchmarks/return_code.py
@@ -28,3 +28,4 @@ class ReturnCode(Enum):
     MICROBENCHMARK_EXECUTION_FAILURE = 32
     MICROBENCHMARK_RESULT_PARSING_FAILURE = 33
     MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE = 34
+    MICROBENCHMARK_DEVICE_GETTING_FAILURE = 35
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 1b15f7c2a..a3db4b6ab 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -28,6 +28,16 @@ superbench:
       model_action:
         - train
   benchmarks:
+    ib-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
+          parallel: yes
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
     mem-bw:
       enable: true
       modes:
diff --git a/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
new file mode 100644
index 000000000..093f41140
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_ib_loopback_performance.py
@@ -0,0 +1,213 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for ib-loopback benchmark."""
+
+import os
+import numbers
+import unittest
+from pathlib import Path
+
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform, BenchmarkType
+from superbench.common.utils import network
+
+
+class IBLoopbackTest(unittest.TestCase):
+    """Tests for IBLoopback benchmark."""
+    def setUp(self):
+        """Method called to prepare the test fixture."""
+        if (len(network.get_ib_devices()) < 1):
+            # Create fake binary file just for testing.
+            os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
+            binary_path = os.path.join(os.getenv('SB_MICRO_PATH'), 'bin')
+            Path(binary_path).mkdir(parents=True, exist_ok=True)
+            self.__binary_file = Path(os.path.join(binary_path, 'run_perftest_loopback'))
+            self.__binary_file.touch(mode=0o755, exist_ok=True)
+
+    def tearDown(self):
+        """Method called after the test method has been called and the result recorded."""
+        if (len(network.get_ib_devices()) < 1):
+            self.__binary_file.unlink()
+
+    def test_ib_loopback_performance(self):
+        """Test ib-loopback benchmark.
+
+        Without an ib device, only the registration, the failing _preprocess() and the result parser (fed with
+        canned perftest output) are checked. With an ib device, the benchmark is launched for real.
+        """
+        raw_output = {}
+        raw_output['AF'] = """
+************************************
+* Waiting for client to connect... *
+************************************
+---------------------------------------------------------------------------------------
+                    RDMA_Write BW Test
+Dual-port       : OFF          Device         : ibP257p0s0
+Number of qps   : 1            Transport type : IB
+Connection type : RC           Using SRQ      : OFF
+PCIe relax order: ON
+---------------------------------------------------------------------------------------
+                    RDMA_Write BW Test
+Dual-port       : OFF          Device         : ibP257p0s0
+Number of qps   : 1            Transport type : IB
+Connection type : RC           Using SRQ      : OFF
+PCIe relax order: ON
+ibv_wr* API     : ON
+TX depth        : 128
+CQ Moderation   : 100
+Mtu             : 4096[B]
+Link type       : IB
+Max inline data : 0[B]
+rdma_cm QPs     : OFF
+Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+ibv_wr* API     : ON
+CQ Moderation   : 100
+Mtu             : 4096[B]
+Link type       : IB
+Max inline data : 0[B]
+rdma_cm QPs     : OFF
+Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+local address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
+local address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
+remote address: LID 0xd06 QPN 0x092e PSN 0x3eb82d RKey 0x080228 VAddr 0x007f19adcbf000
+remote address: LID 0xd06 QPN 0x092f PSN 0x3ff1bc RKey 0x080329 VAddr 0x007fc97ff50000
+---------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------
+#bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+#bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+2          2000             5.32               5.30               2.778732
+4          2000             10.65              10.64              2.788833
+8          2000             21.30              21.27              2.787609
+16         2000             42.60              42.55              2.788268
+32         2000             84.90              82.82              2.713896
+64         2000             173.55             171.66             2.812504
+128        2000             362.27             353.83             2.898535
+256        2000             687.82             679.37             2.782698
+512        2000             1337.12            1311.59            2.686135
+1024       2000             2674.25            2649.39            2.712980
+2048       2000             5248.56            5118.18            2.620509
+4096       2000             10034.02            9948.41                   2.546793
+8192       2000             18620.51            12782.56                  1.636168
+16384      2000             23115.27            16782.50                  1.074080
+32768      2000             22927.94            18586.03                  0.594753
+65536      2000             23330.56            21167.79                  0.338685
+131072     2000             22750.35            21443.14                  0.171545
+262144     2000             22673.63            22411.35                  0.089645
+524288     2000             22679.02            22678.86                  0.045358
+1048576    2000             22817.06            22816.86                  0.022817
+2097152    2000             22919.37            22919.27                  0.011460
+4194304    2000             23277.93            23277.91                  0.005819
+8388608    2000             23240.68            23240.68                  0.002905
+---------------------------------------------------------------------------------------
+8388608    2000             23240.68            23240.68                  0.002905
+---------------------------------------------------------------------------------------
+    """
+        raw_output['S'] = """
+                        RDMA_Write BW Test
+ Dual-port       : OFF		Device         : ibP257p0s0
+ Number of qps   : 1		Transport type : IB
+ Connection type : RC		Using SRQ      : OFF
+ PCIe relax order: ON
+ TX depth        : 128
+ CQ Moderation   : 1
+ Mtu             : 4096[B]
+ Link type       : IB
+ Max inline data : 0[B]
+ rdma_cm QPs	 : OFF
+ Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0xd06 QPN 0x095f PSN 0x3c9e82 RKey 0x080359 VAddr 0x007f9fc479c000
+ remote address: LID 0xd06 QPN 0x095e PSN 0xbd024b RKey 0x080258 VAddr 0x007fe62504b000
+---------------------------------------------------------------------------------------
+ #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+ 8388608    20000            24056.74            24056.72		   0.003007
+************************************
+* Waiting for client to connect... *
+************************************
+---------------------------------------------------------------------------------------
+                    RDMA_Write BW Test
+ Dual-port       : OFF		Device         : ibP257p0s0
+ Number of qps   : 1		Transport type : IB
+ Connection type : RC		Using SRQ      : OFF
+ PCIe relax order: ON
+ CQ Moderation   : 1
+ Mtu             : 4096[B]
+ Link type       : IB
+ Max inline data : 0[B]
+ rdma_cm QPs	 : OFF
+ Data ex. method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0xd06 QPN 0x095e PSN 0xbd024b RKey 0x080258 VAddr 0x007fe62504b000
+ remote address: LID 0xd06 QPN 0x095f PSN 0x3c9e82 RKey 0x080359 VAddr 0x007f9fc479c000
+---------------------------------------------------------------------------------------
+ #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
+ 8388608    20000            24056.74            24056.72		   0.003007
+---------------------------------------------------------------------------------------
+
+---------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------
+"""
+        for mode in ['AF', 'S']:
+            # Both branches below run with the same parameters, which are checked at the end of the loop.
+            parameters = '--ib_index 0 --numa 0 --n 2000 --mode ' + mode
+
+            # Test without ib devices
+            if (len(network.get_ib_devices()) < 1):
+                # Check registry.
+                benchmark_name = 'ib-loopback'
+                (benchmark_class, predefine_params
+                 ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+                assert (benchmark_class)
+
+                # Check preprocess.
+                # Building the command line needs the name of an ib device, so without any device
+                # _preprocess() has to stop with the device-getting return code.
+                benchmark = benchmark_class(benchmark_name, parameters=parameters)
+                ret = benchmark._preprocess()
+                assert (ret is False)
+                assert (benchmark.return_code is ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
+
+                # The parser needs no device: feed it the canned perftest output of this mode.
+                assert (benchmark._process_raw_result(0, raw_output[mode]))
+
+            # Test with ib devices
+            else:
+                # Check registry, preprocess and run.
+                context = BenchmarkRegistry.create_benchmark_context('ib-loopback', parameters=parameters)
+
+                assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+                benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+                # Check raw_data.
+                assert (benchmark.run_count == 1)
+                assert (benchmark.return_code == ReturnCode.SUCCESS)
+                assert ('raw_output_0_IB0' in benchmark.raw_data)
+                assert (len(benchmark.raw_data['raw_output_0_IB0']) == 1)
+                assert (isinstance(benchmark.raw_data['raw_output_0_IB0'][0], str))
+
+            # Check function _process_raw_result.
+            # Positive case - valid raw output.
+            # The metric name is built from the ib index only, so whatever the mode or the ib command is,
+            # one run must yield exactly one 'IB_Avg_<ib_index>' entry holding a single number.
+            metric = 'IB_Avg_{}'.format(str(benchmark._args.ib_index))
+            assert (metric in benchmark.result)
+            assert (len(benchmark.result[metric]) == 1)
+            assert (isinstance(benchmark.result[metric][0], numbers.Number))
+
+            # Negative case - Add invalid raw output.
+            assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
+
+            # Check basic information.
+            assert (benchmark.name == 'ib-loopback')
+            assert (benchmark.type == BenchmarkType.MICRO)
+            assert (benchmark._bin_name == 'run_perftest_loopback')
+
+            # Check parameters specified in BenchmarkContext.
+            assert (benchmark._args.ib_index == 0)
+            assert (benchmark._args.numa == 0)
+            assert (benchmark._args.n == 2000)
+            assert (benchmark._args.size == 8388608)
+            assert (benchmark._args.commands == ['ib_write_bw'])
+            assert (benchmark._args.mode == mode)