Skip to content

Commit

Permalink
Release - SuperBench v0.8.0 (#517)
Browse files Browse the repository at this point in the history
**Description**

Cherry-pick bug fixes from v0.8.0 to main.

**Major Revisions**

* Monitor - Fix the cgroup version checking logic (#502)
* Benchmark - Fix matrix size overflow issue in cuBLASLt GEMM (#503)
* Fix wrong torch usage in communication wrapper for Distributed
Inference Benchmark (#505)
* Analyzer: Fix bug in python3.8 due to pandas api change (#504)
* Bug - Fix bug to get metric from cmd when error happens (#506)
* Monitor - Collect realtime GPU power when benchmarking (#507)
* Add num_workers argument in model benchmark (#511)
* Remove unreachable condition when write host list (#512)
* Update cuda11.8 image to cuda12.1 based on nvcr23.03 (#513)
* Doc - Fix wrong unit of cpu-memory-bw-latency in doc (#515)
* Docs - Upgrade version and release note (#508)

Co-authored-by: guoshzhao <guzhao@microsoft.com>
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
  • Loading branch information
4 people committed Apr 14, 2023
1 parent 97c9a41 commit 51761b3
Show file tree
Hide file tree
Showing 41 changed files with 265 additions and 162 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ jobs:
strategy:
matrix:
include:
- name: cuda11.8
dockerfile: cuda11.8
tags: superbench/main:cuda11.8
- name: cuda12.1
dockerfile: cuda12.1
tags: superbench/main:cuda12.1
- name: cuda11.1.1
dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
Expand Down
50 changes: 26 additions & 24 deletions dockerfile/cuda11.8.dockerfile → dockerfile/cuda12.1.dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
FROM nvcr.io/nvidia/pytorch:22.12-py3
FROM nvcr.io/nvidia/pytorch:23.03-py3

# OS:
# - Ubuntu: 20.04
# - OpenMPI: 4.1.5a1
# - Docker Client: 20.10.8
# NVIDIA:
# - CUDA: 11.8.0
# - cuDNN: 8.7.0.84
# - NCCL: v2.15.5-1
# - CUDA: 12.1.0
# - cuDNN: 8.8.1.3
# - NCCL: v2.17.1-1
# Mellanox:
# - OFED: 5.2-2.2.3.0
# - HPC-X: v2.8.3
# - OFED: 5.2-2.2.3.0 # TODO
# - HPC-X: v2.14
# Intel:
# - mlc: v3.9a
# - mlc: v3.10

LABEL maintainer="SuperBench"

Expand Down Expand Up @@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \
# Install OFED
ENV OFED_VERSION=5.2-2.2.3.0
RUN cd /tmp && \
wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Install HPC-X
ENV HPCX_VERSION=v2.14
RUN cd /opt && \
rm -rf hpcx && \
wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \
rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz
wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
tar xf hpcx.tbz && \
mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
rm hpcx.tbz

# Install Intel MLC
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz

ENV PATH="${PATH}" \
LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment

# Install AOCC compiler
RUN cd /tmp && \
wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
Expand All @@ -115,6 +105,18 @@ RUN cd /tmp && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz


ENV PATH="${PATH}" \
LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment

# Add config files
ADD dockerfile/etc /opt/microsoft/

Expand Down
4 changes: 2 additions & 2 deletions docs/developer-guides/using-docker.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ You need to [clone the code](./development.md#set-up) first before building the
export DOCKER_BUILDKIT=1
docker buildx build \
--platform linux/amd64 --cache-to type=inline,mode=max \
--tag superbench-dev --file dockerfile/cuda11.1.1.dockerfile .
--tag superbench-dev --file dockerfile/cuda12.1.dockerfile .
```

</TabItem>
Expand All @@ -39,7 +39,7 @@ docker buildx build \
export DOCKER_BUILDKIT=1
docker buildx build \
--platform linux/amd64 --cache-to type=inline,mode=max \
--tag superbench-dev --file dockerfile/rocm4.2-pytorch1.7.0.dockerfile .
--tag superbench-dev --file dockerfile/rocm5.1.x.dockerfile .
```

</TabItem>
Expand Down
4 changes: 2 additions & 2 deletions docs/getting-started/installation.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ but it is not strictly necessary.

```bash
# create a new virtual environment
python3 -m venv --system-site-packages ./venv
python3 -m venv ./venv
# activate the virtual environment
source ./venv/bin/activate

Expand All @@ -61,7 +61,7 @@ You can clone the source from GitHub and build it.
:::note Note
You should check out the corresponding tag to use a release version, for example,

`git clone -b v0.7.0 https://github.com/microsoft/superbenchmark`
`git clone -b v0.8.0 https://github.com/microsoft/superbenchmark`
:::

```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/getting-started/run-superbench.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
:::note Note
You should deploy the corresponding Docker image to use a release version, for example,

`sb deploy -f local.ini -i superbench/superbench:v0.7.0-cuda11.1.1`
`sb deploy -f local.ini -i superbench/superbench:v0.8.0-cuda12.1`

Note that the version of the git repo only determines the version of the sb CLI, not the version of the sb container. You should specify the container version explicitly even if you checked out a release version in the git clone.

Expand Down
2 changes: 1 addition & 1 deletion docs/superbench-config.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ superbench:
<TabItem value='example'>

```yaml
version: v0.7
version: v0.8
superbench:
enable: benchmark_1
monitor:
Expand Down
24 changes: 12 additions & 12 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,11 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer

#### Metrics

| Name | Unit | Description |
|---------------------|--------------------|----------------------------------------------------------------------------|
| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). |
| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. |
| cpu-hpl/time | time (s) | Time elapsed during HPL run. |
| Name | Unit | Description |
|--------------------|--------------------|---------------------------------------------------------------------------|
| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). |
| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. |
| cpu-hpl/time | time (s) | Time elapsed during HPL run. |

### `cpu-stream`

Expand Down Expand Up @@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/

| Name | Unit | Description |
|-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------|
| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth. |
| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (us) | Former NUMA to latter NUMA memory latency. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth. |
| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (ns) | Former NUMA to latter NUMA memory latency. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. |
| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |

### `mem-bw`

Expand Down
6 changes: 6 additions & 0 deletions docs/user-tutorial/container-images.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ available tags are listed below for all stable versions.

| Tag | Description |
|-------------------|------------------------------------|
| v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 |
| v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 |
| v0.7.0-cuda11.8 | SuperBench v0.7.0 with CUDA 11.8 |
| v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 |
| v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 |
Expand All @@ -43,6 +45,10 @@ available tags are listed below for all stable versions.

| Tag | Description |
|-------------------------------|--------------------------------------------------|
| v0.8.0-rocm5.1.3 | SuperBench v0.8.0 with ROCm 5.1.3 |
| v0.8.0-rocm5.1.1 | SuperBench v0.8.0 with ROCm 5.1.1 |
| v0.8.0-rocm5.0.1 | SuperBench v0.8.0 with ROCm 5.0.1 |
| v0.8.0-rocm5.0 | SuperBench v0.8.0 with ROCm 5.0 |
| v0.7.0-rocm5.1.3 | SuperBench v0.7.0 with ROCm 5.1.3 |
| v0.7.0-rocm5.1.1 | SuperBench v0.7.0 with ROCm 5.1.1 |
| v0.7.0-rocm5.0.1 | SuperBench v0.7.0 with ROCm 5.0.1 |
Expand Down
2 changes: 1 addition & 1 deletion docs/user-tutorial/data-diagnosis.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ superbench:
example:
```yaml
# SuperBench rules
version: v0.7
version: v0.8
superbench:
rules:
failure-rule:
Expand Down
2 changes: 1 addition & 1 deletion docs/user-tutorial/result-summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ superbench:

```yaml title="Example"
# SuperBench rules
version: v0.7
version: v0.8
superbench:
rules:
kernel_launch:
Expand Down
2 changes: 1 addition & 1 deletion superbench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
Provide hardware and software benchmarks for AI systems.
"""

__version__ = '0.7.0'
__version__ = '0.8.0'
__author__ = 'Microsoft'
14 changes: 10 additions & 4 deletions superbench/analyzer/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@ def statistic(raw_data_df):
logger.warning('DataAnalyzer: empty data.')
return data_statistics_df
try:
raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
raw_data_df = raw_data_df.dropna(axis=1, how='all')
data_statistics_df = raw_data_df.describe()
data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01)
data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05)
data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95)
data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99)
data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01, numeric_only=True)
data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05, numeric_only=True)
data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95, numeric_only=True)
data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99, numeric_only=True)
statistics_error = []
for column in list(raw_data_df.columns):
if column not in list(data_statistics_df.columns) and not raw_data_df[column].isnull().all():
Expand Down Expand Up @@ -122,6 +124,8 @@ def correlation(raw_data_df):
logger.warning('DataAnalyzer: empty data.')
return data_corr_df
try:
raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
raw_data_df = raw_data_df.dropna(axis=1, how='all')
data_corr_df = raw_data_df.corr()
statistics_error = []
for column in list(raw_data_df.columns):
Expand Down Expand Up @@ -181,6 +185,8 @@ def generate_baseline(raw_data_df, output_dir):
output_dir (str): the directory of output file
"""
try:
raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
raw_data_df = raw_data_df.dropna(axis=1, how='all')
if not isinstance(raw_data_df, pd.DataFrame):
logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
return
Expand Down
2 changes: 1 addition & 1 deletion superbench/analyzer/data_diagnosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def output_diagnosis_in_excel(self, raw_data_df, data_not_accept_df, output_path
logger.log_and_raise(exception=IOError, msg='DataDiagnosis: excel_data_output - invalid file path.')
file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data')
file_handler.output_excel_data_not_accept(writer, data_not_accept_df, rules)
writer.save()
writer.close()
except Exception as e:
logger.log_and_raise(exception=Exception, msg='DataDiagnosis: excel_data_output - {}'.format(str(e)))

Expand Down
4 changes: 2 additions & 2 deletions superbench/analyzer/result_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def _merge_summary(self, summary):
summary_df = pd.DataFrame()
for category in summary:
for i in range(len(summary[category])):
summary_df = summary_df.append([summary[category][i]], ignore_index=True)
summary_df = pd.concat([summary_df, pd.DataFrame([summary[category][i]])], ignore_index=True)
return summary_df

def _generate_summary(self, round):
Expand Down Expand Up @@ -217,7 +217,7 @@ def output_summary_in_excel(self, raw_data_df, summary, output_path):
file_handler.merge_column_in_excel(worksheet, row, 1)
else:
logger.error('ResultSummary: excel_data_output - summary is empty.')
writer.save()
writer.close()
except Exception as e:
logger.error('ResultSummary: excel_data_output - {}'.format(str(e)))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,20 +88,21 @@ template <typename T> cudaDataType_t get_datatype() {
}

template <typename Ta, typename Tb, typename Tout>
float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) {
float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, int iter) {
// init matrix
Ta *matrix_a = nullptr;
Tb *matrix_b = nullptr;
Tout *matrix_out = nullptr;
cudaMalloc(&matrix_a, m * k * std::max(batch, 1) * sizeof(Ta));
cudaMalloc(&matrix_b, k * n * std::max(batch, 1) * sizeof(Tb));
cudaMalloc(&matrix_out, m * n * std::max(batch, 1) * sizeof(Tout));
batch = std::max<size_t>(batch, 1);
cudaMalloc(&matrix_a, m * k * batch * sizeof(Ta));
cudaMalloc(&matrix_b, k * n * batch * sizeof(Tb));
cudaMalloc(&matrix_out, m * n * batch * sizeof(Tout));

init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * std::max(batch, 1));
init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * std::max(batch, 1));
init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * batch);
init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * batch);

// init gemm
int lda = k, ldb = k, ldd = m;
size_t lda = k, ldb = k, ldd = m;
std::unique_ptr<cublasLtGemm> gemm = std::make_unique<cublasLtGemm>();
gemm->Init();
gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tout>(),
Expand Down

0 comments on commit 51761b3

Please sign in to comment.