Release - SuperBench v0.8.0 (#517)

**Description**

Cherry-pick bug fixes from v0.8.0 to main.

**Major Revisions**

* Monitor - Fix the cgroup version checking logic (#502)
* Benchmark - Fix matrix size overflow issue in cuBLASLt GEMM (#503)
* Fix wrong torch usage in communication wrapper for Distributed Inference Benchmark (#505)
* Analyzer: Fix bug in python3.8 due to pandas api change (#504)
* Bug - Fix bug to get metric from cmd when error happens (#506)
* Monitor - Collect realtime GPU power when benchmarking (#507)
* Add num_workers argument in model benchmark (#511)
* Remove unreachable condition when write host list (#512)
* Update cuda11.8 image to cuda12.1 based on nvcr23.03 (#513)
* Doc - Fix wrong unit of cpu-memory-bw-latency in doc (#515)
* Docs - Upgrade version and release note (#508)

Co-authored-by: guoshzhao <guzhao@microsoft.com>
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
microsoft · Apr 14, 2023 · 51761b3 · 51761b3
1 parent 97c9a41
commit 51761b3
Show file tree

Hide file tree

Showing 41 changed files with 265 additions and 162 deletions.
diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
@@ -24,9 +24,9 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cuda11.8
-          dockerfile: cuda11.8
-          tags: superbench/main:cuda11.8
+        - name: cuda12.1
+          dockerfile: cuda12.1
+          tags: superbench/main:cuda12.1
         - name: cuda11.1.1
           dockerfile: cuda11.1.1
           tags: superbench/main:cuda11.1.1,superbench/superbench:latest

diff --git a/dockerfile/cuda11.8.dockerfile → dockerfile/cuda12.1.dockerfile b/dockerfile/cuda11.8.dockerfile → dockerfile/cuda12.1.dockerfile
@@ -1,18 +1,18 @@
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+FROM nvcr.io/nvidia/pytorch:23.03-py3
 
 # OS:
 #   - Ubuntu: 20.04
 #   - OpenMPI: 4.1.5a1
 #   - Docker Client: 20.10.8
 # NVIDIA:
-#   - CUDA: 11.8.0
-#   - cuDNN: 8.7.0.84
-#   - NCCL: v2.15.5-1
+#   - CUDA: 12.1.0
+#   - cuDNN: 8.8.1.3
+#   - NCCL: v2.17.1-1
 # Mellanox:
-#   - OFED: 5.2-2.2.3.0
-#   - HPC-X: v2.8.3
+#   - OFED: 5.2-2.2.3.0 # TODO
+#   - HPC-X: v2.14
 # Intel:
-#   - mlc: v3.9a
+#   - mlc: v3.10
 
 LABEL maintainer="SuperBench"
 
@@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \
 # Install OFED
 ENV OFED_VERSION=5.2-2.2.3.0
 RUN cd /tmp && \
-    wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
     rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
 
 # Install HPC-X
+ENV HPCX_VERSION=v2.14
 RUN cd /opt && \
     rm -rf hpcx && \
-    wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \
-    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz
+    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
+    tar xf hpcx.tbz && \
+    mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
+    rm hpcx.tbz
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
 
-ENV PATH="${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
-    SB_HOME=/opt/superbench \
-    SB_MICRO_PATH=/opt/superbench \
-    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
-    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
-
-RUN echo PATH="$PATH" > /etc/environment && \
-    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
-    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
-
 # Install AOCC compiler
 RUN cd /tmp && \
     wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
@@ -115,6 +105,18 @@ RUN cd /tmp && \
     mv amd-blis /opt/AMD && \
     rm -rf aocl-blis-linux-aocc-4.0.tar.gz
 
+
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench \
+    SB_MICRO_PATH=/opt/superbench \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
 

diff --git a/docs/developer-guides/using-docker.mdx b/docs/developer-guides/using-docker.mdx
@@ -29,7 +29,7 @@ You need to [clone the code](./development.md#set-up) first before building the
 export DOCKER_BUILDKIT=1
 docker buildx build \
   --platform linux/amd64 --cache-to type=inline,mode=max \
-  --tag superbench-dev --file dockerfile/cuda11.1.1.dockerfile .
+  --tag superbench-dev --file dockerfile/cuda12.1.dockerfile .
 ```
 
 </TabItem>
@@ -39,7 +39,7 @@ docker buildx build \
 export DOCKER_BUILDKIT=1
 docker buildx build \
   --platform linux/amd64 --cache-to type=inline,mode=max \
-  --tag superbench-dev --file dockerfile/rocm4.2-pytorch1.7.0.dockerfile .
+  --tag superbench-dev --file dockerfile/rocm5.1.x.dockerfile .
 ```
 
 </TabItem>

diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx
@@ -45,7 +45,7 @@ but it is not strictly necessary.
 
 ```bash
 # create a new virtual environment
-python3 -m venv --system-site-packages ./venv
+python3 -m venv ./venv
 # activate the virtual environment
 source ./venv/bin/activate
 
@@ -61,7 +61,7 @@ You can clone the source from GitHub and build it.
 :::note Note
 You should checkout corresponding tag to use release version, for example,
 
-`git clone -b v0.7.0 https://github.com/microsoft/superbenchmark`
+`git clone -b v0.8.0 https://github.com/microsoft/superbenchmark`
 :::
 
 ```bash

diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md
@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
 :::note Note
 You should deploy corresponding Docker image to use release version, for example,
 
-`sb deploy -f local.ini -i superbench/superbench:v0.7.0-cuda11.1.1`
+`sb deploy -f local.ini -i superbench/superbench:v0.8.0-cuda12.1`
 
 You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone.
 

diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx
@@ -70,7 +70,7 @@ superbench:
 <TabItem value='example'>
 
 ```yaml
-version: v0.7
+version: v0.8
 superbench:
   enable: benchmark_1
   monitor:

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -180,11 +180,11 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
 
 #### Metrics
 
-| Name                | Unit               | Description                                                                |
-|---------------------|--------------------|----------------------------------------------------------------------------|
-| cpu-hpl/tests_pass  |                    | HPL completed running and correctness test has passed (1: pass, 0: fail).  |
-| cpu-hpl/throughput  | bandwidth (GFlops) | Compute bandwidth.                                                         |
-| cpu-hpl/time        | time (s)           | Time elapsed during HPL run.                                               |
+| Name               | Unit               | Description                                                               |
+|--------------------|--------------------|---------------------------------------------------------------------------|
+| cpu-hpl/tests_pass |                    | HPL completed running and correctness test has passed (1: pass, 0: fail). |
+| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth.                                                        |
+| cpu-hpl/time       | time (s)           | Time elapsed during HPL run.                                              |
 
 ### `cpu-stream`
 
@@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
 
 | Name                                                                    | Unit             | Description                                                         |
 |-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------|
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw  | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth.                        |
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (us)        | Former NUMA to latter NUMA memory latency.                          |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw               | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read.                      |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw       | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw  | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth.                        |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (ns)        | Former NUMA to latter NUMA memory latency.                          |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw               | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read.                      |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw       | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
 
 ### `mem-bw`
 

diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx
@@ -29,6 +29,8 @@ available tags are listed below for all stable versions.
 
 | Tag               | Description                        |
 |-------------------|------------------------------------|
+| v0.8.0-cuda12.1   | SuperBench v0.8.0 with CUDA 12.1   |
+| v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 |
 | v0.7.0-cuda11.8   | SuperBench v0.7.0 with CUDA 11.8   |
 | v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 |
 | v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 |
@@ -43,6 +45,10 @@ available tags are listed below for all stable versions.
 
 | Tag                           | Description                                      |
 |-------------------------------|--------------------------------------------------|
+| v0.8.0-rocm5.1.3              | SuperBench v0.8.0 with ROCm 5.1.3                |
+| v0.8.0-rocm5.1.1              | SuperBench v0.8.0 with ROCm 5.1.1                |
+| v0.8.0-rocm5.0.1              | SuperBench v0.8.0 with ROCm 5.0.1                |
+| v0.8.0-rocm5.0                | SuperBench v0.8.0 with ROCm 5.0                  |
 | v0.7.0-rocm5.1.3              | SuperBench v0.7.0 with ROCm 5.1.3                |
 | v0.7.0-rocm5.1.1              | SuperBench v0.7.0 with ROCm 5.1.1                |
 | v0.7.0-rocm5.0.1              | SuperBench v0.7.0 with ROCm 5.0.1                |

diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md
@@ -65,7 +65,7 @@ superbench:
 example:
 ```yaml
 # SuperBench rules
-version: v0.7
+version: v0.8
 superbench:
   rules:
     failure-rule:

diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md
@@ -58,7 +58,7 @@ superbench:
 
 ```yaml title="Example"
 # SuperBench rules
-version: v0.7
+version: v0.8
 superbench:
   rules:
     kernel_launch:

diff --git a/superbench/__init__.py b/superbench/__init__.py
@@ -6,5 +6,5 @@
 Provide hardware and software benchmarks for AI systems.
 """
 
-__version__ = '0.7.0'
+__version__ = '0.8.0'
 __author__ = 'Microsoft'
diff --git a/superbench/analyzer/data_analysis.py b/superbench/analyzer/data_analysis.py
@@ -31,11 +31,13 @@ def statistic(raw_data_df):
         logger.warning('DataAnalyzer: empty data.')
         return data_statistics_df
     try:
+        raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
+        raw_data_df = raw_data_df.dropna(axis=1, how='all')
         data_statistics_df = raw_data_df.describe()
-        data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01)
-        data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05)
-        data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95)
-        data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99)
+        data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01, numeric_only=True)
+        data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05, numeric_only=True)
+        data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95, numeric_only=True)
+        data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99, numeric_only=True)
         statistics_error = []
         for column in list(raw_data_df.columns):
             if column not in list(data_statistics_df.columns) and not raw_data_df[column].isnull().all():
@@ -122,6 +124,8 @@ def correlation(raw_data_df):
         logger.warning('DataAnalyzer: empty data.')
         return data_corr_df
     try:
+        raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
+        raw_data_df = raw_data_df.dropna(axis=1, how='all')
         data_corr_df = raw_data_df.corr()
         statistics_error = []
         for column in list(raw_data_df.columns):
@@ -181,6 +185,8 @@ def generate_baseline(raw_data_df, output_dir):
         output_dir (str): the directory of output file
     """
     try:
+        raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
+        raw_data_df = raw_data_df.dropna(axis=1, how='all')
         if not isinstance(raw_data_df, pd.DataFrame):
             logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
             return

diff --git a/superbench/analyzer/data_diagnosis.py b/superbench/analyzer/data_diagnosis.py
@@ -285,7 +285,7 @@ def output_diagnosis_in_excel(self, raw_data_df, data_not_accept_df, output_path
                 logger.log_and_raise(exception=IOError, msg='DataDiagnosis: excel_data_output - invalid file path.')
             file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data')
             file_handler.output_excel_data_not_accept(writer, data_not_accept_df, rules)
-            writer.save()
+            writer.close()
         except Exception as e:
             logger.log_and_raise(exception=Exception, msg='DataDiagnosis: excel_data_output - {}'.format(str(e)))
 

diff --git a/superbench/analyzer/result_summary.py b/superbench/analyzer/result_summary.py
@@ -117,7 +117,7 @@ def _merge_summary(self, summary):
         summary_df = pd.DataFrame()
         for category in summary:
             for i in range(len(summary[category])):
-                summary_df = summary_df.append([summary[category][i]], ignore_index=True)
+                summary_df = pd.concat([summary_df, pd.DataFrame([summary[category][i]])], ignore_index=True)
         return summary_df
 
     def _generate_summary(self, round):
@@ -217,7 +217,7 @@ def output_summary_in_excel(self, raw_data_df, summary, output_path):
                 file_handler.merge_column_in_excel(worksheet, row, 1)
             else:
                 logger.error('ResultSummary: excel_data_output - summary is empty.')
-            writer.save()
+            writer.close()
         except Exception as e:
             logger.error('ResultSummary: excel_data_output - {}'.format(str(e)))
 

diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
@@ -88,20 +88,21 @@ template <typename T> cudaDataType_t get_datatype() {
 }
 
 template <typename Ta, typename Tb, typename Tout>
-float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) {
+float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, int iter) {
     // init matrix
     Ta *matrix_a = nullptr;
     Tb *matrix_b = nullptr;
     Tout *matrix_out = nullptr;
-    cudaMalloc(&matrix_a, m * k * std::max(batch, 1) * sizeof(Ta));
-    cudaMalloc(&matrix_b, k * n * std::max(batch, 1) * sizeof(Tb));
-    cudaMalloc(&matrix_out, m * n * std::max(batch, 1) * sizeof(Tout));
+    batch = std::max<size_t>(batch, 1);
+    cudaMalloc(&matrix_a, m * k * batch * sizeof(Ta));
+    cudaMalloc(&matrix_b, k * n * batch * sizeof(Tb));
+    cudaMalloc(&matrix_out, m * n * batch * sizeof(Tout));
 
-    init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * std::max(batch, 1));
-    init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * std::max(batch, 1));
+    init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * batch);
+    init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * batch);
 
     // init gemm
-    int lda = k, ldb = k, ldd = m;
+    size_t lda = k, ldb = k, ldd = m;
     std::unique_ptr<cublasLtGemm> gemm = std::make_unique<cublasLtGemm>();
     gemm->Init();
     gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tout>(),