Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dockerfile - Upgrade to rocm5.7 dockerfile #587

Merged
merged 19 commits into from
Dec 9, 2023
Merged
48 changes: 14 additions & 34 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ on:
jobs:
docker:
name: Docker build ${{ matrix.name }}
runs-on: ubuntu-latest
runs-on: ${{ matrix.runner }}
permissions:
contents: read
packages: write
Expand All @@ -27,54 +27,34 @@ jobs:
- name: cuda12.2
dockerfile: cuda12.2
tags: superbench/main:cuda12.2
runner: ubuntu-latest
- name: cuda11.1.1
yukirora marked this conversation as resolved.
Show resolved Hide resolved
dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
- name: rocm5.1.3
dockerfile: rocm5.1.x
tags: superbench/main:rocm5.1.3
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.1.3_ubuntu20.04_py3.7_pytorch_1.11.0
- name: rocm5.1.1
dockerfile: rocm5.1.x
tags: superbench/main:rocm5.1.1
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.1.1_ubuntu20.04_py3.7_pytorch_1.10.0
- name: rocm5.0.1
dockerfile: rocm5.0.x
tags: superbench/main:rocm5.0.1
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.0.1_ubuntu18.04_py3.7_pytorch_1.9.0
- name: rocm5.0
dockerfile: rocm5.0.x
tags: superbench/main:rocm5.0
extra_args: >-
BASE_IMAGE=rocm/pytorch:rocm5.0_ubuntu18.04_py3.7_pytorch_1.9.0
runner: ubuntu-latest
- name: rocm5.7
dockerfile: rocm5.7.x
tags: superbench/main:rocm5.7
runner: [self-hosted, rocm-build]
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Free disk space
run: |
mkdir /tmp/emptydir
mkdir -p /tmp/emptydir
for dir in /usr/share/swift /usr/share/dotnet /usr/local/share/powershell /usr/local/share/chromium /usr/local/lib/android /opt/ghc; do
sudo rsync -a --delete /tmp/emptydir/ ${dir}
done
sudo apt-get clean
sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps")
# Check if Docker images exist before trying to remove them
if sudo docker images -q --filter=reference="node" --filter=reference="buildpack-deps" | grep -q .; then
sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps")
else
echo "No Docker images found with the specified references."
fi
df -h
- name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
uses: hirnidrin/free-disk-space@main
with:
# This might remove tools that are actually needed, if set to "true" but frees about 6 GB
tool-cache: false
# All of these default to true, but feel free to set to "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: true
- name: Prepare metadata
id: metadata
run: |
Expand Down
175 changes: 175 additions & 0 deletions dockerfile/rocm5.7.x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
ARG BASE_IMAGE=rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1

FROM ${BASE_IMAGE}

# Image contents
#   OS:    Ubuntu 22.04, Docker client 20.10.8
#   ROCm:  5.7
#   Intel: mlc v3.10

LABEL maintainer="SuperBench"

# Make apt/dpkg run without interactive prompts.
ENV DEBIAN_FRONTEND=noninteractive

# System packages: build toolchain, hipify, networking/SSH tools and
# hardware inspection utilities used by the steps below.
RUN apt-get update && \
    apt-get install -q -y --no-install-recommends \
        autoconf \
        automake \
        build-essential \
        curl \
        dmidecode \
        git \
        hipify-clang \
        iproute2 \
        jq \
        libaio-dev \
        libboost-program-options-dev \
        libcap2 \
        libnuma-dev \
        libpci-dev \
        libssl-dev \
        libtinfo5 \
        libtool \
        lshw \
        net-tools \
        numactl \
        openssh-client \
        openssh-server \
        pciutils \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && rm -rf /tmp/*

# Number of parallel jobs used by the source builds below.
ARG NUM_MAKE_JOBS=16

# Build CMake from source unless the image already ships a new enough version.
# - The full dotted version (e.g. 3.26.4) is captured so that it can be compared
#   against required_version with `sort -V`; a missing cmake is treated as 0.0.
# - The then-branch is terminated with `;` so the else-branch really is an
#   else-branch instead of extra arguments to `rm`.
RUN required_version="3.26.4" && \
    cmake_version="$(cmake --version 2>/dev/null | grep -oP '(?<=cmake version )\d+(\.\d+)+' || echo '0.0')" && \
    if [ "$(printf '%s\n' "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
        echo "existing cmake version is ${cmake_version}" && \
        cd /tmp && \
        wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \
        tar xzf cmake-${required_version}.tar.gz && \
        cd cmake-${required_version} && \
        ./bootstrap --prefix=/usr --no-system-curl --parallel=${NUM_MAKE_JOBS} && \
        make -j ${NUM_MAKE_JOBS} && \
        make install && \
        cd /tmp && \
        rm -rf /tmp/cmake-${required_version}*; \
    else \
        echo "CMake version is greater than or equal to ${required_version}"; \
    fi

# Install the static Docker client binaries into /usr/local/bin.
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \
    wget -q -O docker.tgz https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz && \
    tar -x -f docker.tgz --strip-components 1 -C /usr/local/bin/ && \
    rm docker.tgz

# Update system config:
# - prepare root's SSH directory and the sshd runtime directory,
# - allow root login and user environment files, force sshd onto port 22,
# - raise the open-file limits for all users and for root.
# printf is used instead of `echo "...\n..."` because echo's handling of "\n"
# depends on which shell /bin/sh is; printf writes one entry per line everywhere.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' \
        >> /etc/security/limits.conf


# Install OFED (user-space only) when ofed_info is not already on PATH.
# The Ubuntu release is needed to pick the matching MLNX_OFED tarball. A shell
# variable exported in one RUN does not survive into later instructions, so the
# release is read from /etc/os-release inside the same RUN that uses it.
ENV OFED_VERSION=5.9-0.5.6.0
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
        UBUNTU_VERSION="$(. /etc/os-release && echo "${VERSION_ID}")" && \
        echo "Ubuntu version: ${UBUNTU_VERSION}" && \
        echo "OFED not found. Installing OFED..." && \
        cd /tmp && \
        wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
        tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
        PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
        rm -rf MLNX_OFED_LINUX-${OFED_VERSION}*; \
    fi

# Install UCX with ROCm support when /opt/ucx is missing or empty.
# - The install prefix honours UCX_DIR if the base image defines it and
#   otherwise falls back to /opt/ucx, the location that the OpenMPI build and
#   LD_LIBRARY_PATH further down expect.
# - The clone lives in /tmp/ucx, so that is the directory removed afterwards.
ENV UCX_VERSION=1.14.1
RUN if [ -z "$(ls -A /opt/ucx 2>/dev/null)" ]; then \
        echo "/opt/ucx is empty. Installing UCX..." && \
        cd /tmp && \
        git clone https://github.com/openucx/ucx.git -b v${UCX_VERSION} && \
        cd ucx && \
        ./autogen.sh && \
        mkdir build && \
        cd build && \
        ../configure --prefix="${UCX_DIR:-/opt/ucx}" --with-rocm=/opt/rocm --without-knem && \
        make -j $(nproc) && \
        make -j $(nproc) install && \
        cd / && \
        rm -rf /tmp/ucx; \
    else \
        echo "/opt/ucx is not empty. Skipping UCX installation."; \
    fi

# Install OpenMPI (built against UCX and ROCm) unless mpirun is already
# installed under /usr/local/bin.
# - mpirun is an executable, not a directory, so the check uses -x.
# - The sources are cloned into /tmp/ompi, so that is the directory removed
#   once the install has finished.
ENV OPENMPI_VERSION=4.1.x
RUN [ -x /usr/local/bin/mpirun ] || { \
        echo "Open MPI not found. Installing Open MPI..." && \
        cd /tmp && \
        git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
        cd ompi && \
        ./autogen.pl && \
        mkdir build && \
        cd build && \
        ../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --enable-mca-no-build=btl-uct --with-ucx=/opt/ucx --with-rocm=/opt/rocm && \
        make -j $(nproc) && \
        make -j $(nproc) install && \
        ldconfig && \
        cd / && \
        rm -rf /tmp/ompi; \
    }

# Install Intel MLC: fetch the v3.10 archive, extract only the Linux binary,
# copy it into /usr/local/bin and remove the downloaded files.
RUN cd /tmp && \
    wget -q -O mlc.tgz https://downloadmirror.intel.com/763324/mlc_v3.10.tgz && \
    tar -xzf mlc.tgz Linux/mlc && \
    cp ./Linux/mlc /usr/local/bin/ && \
    rm -rf ./Linux mlc.tgz

# Build RCCL from source in /opt/rccl/build using hipcc as the C++ compiler.
# The build tree is kept: librccl.so is referenced from it via LD_PRELOAD.
RUN cd /opt/ && \
    git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \
    mkdir rccl/build && \
    cd rccl/build && \
    CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. && \
    make -j${NUM_MAKE_JOBS}

# Runtime environment: SuperBench, ROCm and MPI locations plus Ansible settings.
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
    LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
    LD_LIBRARY_PATH="/opt/ucx/lib:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Write the key variables to /etc/environment (the file is overwritten).
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment

WORKDIR ${SB_HOME}

# Copy the build context into SB_HOME and install the Python package with the
# AMD worker extras, then run the project's post-install and C++ build targets.
ADD . .
RUN apt install rocm-cmake -y && \
    python3 -m pip install --upgrade pip wheel setuptools==65.7 && \
    python3 -m pip install .[amdworker] && \
    make postinstall
RUN make cppbuild

# Build the ROCm third-party benchmarks. Each `-o <name>` tells make to treat
# that target as up to date, so cpu_hpl, cpu_stream and megatron_lm are not
# rebuilt here.
ADD third_party third_party
RUN make \
        RCCL_HOME=/opt/rccl/build/ \
        ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 \
        HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 \
        ROCM_VER=rocm-5.5.0 \
        -C third_party rocm \
        -o cpu_hpl -o cpu_stream -o megatron_lm
2 changes: 1 addition & 1 deletion docs/developer-guides/using-docker.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ docker buildx build \
export DOCKER_BUILDKIT=1
docker buildx build \
--platform linux/amd64 --cache-to type=inline,mode=max \
--tag superbench-dev --file dockerfile/rocm5.1.x.dockerfile .
--tag superbench-dev --file dockerfile/rocm5.7.x.dockerfile .
```

</TabItem>
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def run(self):
**x,
'develop': x['dev'] + x['test'],
'cpuworker': x['torch'],
'amdworker': x['torch'] + x['ort'] + x['amd'],
'amdworker': x['torch'] + x['amd'],
'nvworker': x['torch'] + x['ort'] + x['nvidia'],
}
)(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,24 @@ if(CUDAToolkit_FOUND)
include(../cuda_common.cmake)
add_executable(gpu_copy gpu_copy.cu)
set_property(TARGET gpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(gpu_copy numa)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(HIP QUIET)
if(HIP_FOUND)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})

# Convert cuda code to hip code inplace
execute_process(COMMAND hipify-perl -inplace -print-stats gpu_copy.cu
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o gpu_copy.cpp gpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)

# Add HIP targets
set_source_files_properties(gpu_copy.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Link with HIP
hip_add_executable(gpu_copy gpu_copy.cu)
# link hip device lib
add_executable(gpu_copy gpu_copy.cpp)
add_compile_options(-O2)
target_link_libraries(gpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()

install(TARGETS gpu_copy RUNTIME DESTINATION bin)
target_link_libraries(gpu_copy numa)
install(TARGETS gpu_copy RUNTIME DESTINATION bin)
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,16 @@ if(CUDAToolkit_FOUND)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(HIP QUIET)
if(HIP_FOUND)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found HIP: " ${HIP_VERSION})

# Convert cuda code to hip code inplace
execute_process(COMMAND hipify-perl -inplace -print-stats kernel_launch.cu
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o kernel_launch.cpp kernel_launch.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)

# Add HIP targets
set_source_files_properties(kernel_launch.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Link with HIP
hip_add_executable(kernel_launch_overhead kernel_launch.cu)
# link hip device lib
add_executable(kernel_launch_overhead kernel_launch.cpp)
target_link_libraries(kernel_launch_overhead hip::device)
# Install targets
install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
else()
Expand Down
28 changes: 26 additions & 2 deletions superbench/benchmarks/micro_benchmarks/rocm_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,36 @@

# Set ROCM_PATH
# The ROCM_PATH environment variable wins when it is defined; otherwise the
# ROCm root is queried from `hipconfig -R`. Configuration stops with a fatal
# error if hipconfig cannot be run or exits with a non-zero status.
if(DEFINED ENV{ROCM_PATH})
    set(ROCM_PATH "$ENV{ROCM_PATH}")
else()
    # Run hipconfig -R to get ROCm path
    execute_process(
        COMMAND hipconfig -R
        RESULT_VARIABLE HIPCONFIG_RESULT
        OUTPUT_VARIABLE ROCM_PATH
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )

    # Check if hipconfig was successful
    if(NOT HIPCONFIG_RESULT EQUAL 0)
        message(FATAL_ERROR "Failed to run hipconfig -R. Make sure ROCm is installed and hipconfig is available.")
    endif()
endif()

# Set HIP_PATH
# The HIP_PATH environment variable wins when it is defined; otherwise the HIP
# root is queried from `hipconfig -p`. Configuration stops with a fatal error
# if hipconfig cannot be run or exits with a non-zero status.
if(DEFINED ENV{HIP_PATH})
    set(HIP_PATH "$ENV{HIP_PATH}")
else()
    # Run hipconfig -p to get HIP path
    execute_process(
        COMMAND hipconfig -p
        RESULT_VARIABLE HIPCONFIG_RESULT
        OUTPUT_VARIABLE HIP_PATH
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )

    # Check if hipconfig was successful
    if(NOT HIPCONFIG_RESULT EQUAL 0)
        message(FATAL_ERROR "Failed to run hipconfig -p. Make sure ROCm is installed and hipconfig is available.")
    endif()
endif()
Expand All @@ -24,6 +46,8 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
# When the HIP installation directory exists, make it discoverable:
# - add the HIP and ROCm roots to CMAKE_PREFIX_PATH for find_package(),
#   keeping any prefixes that were already configured,
# - compile C++ sources with hipcc,
# - add the HIP CMake module directories to CMAKE_MODULE_PATH.
if(EXISTS "${HIP_PATH}")
    # Search for hip in common locations
    list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}" "${ROCM_PATH}")
    set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
    set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
    set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
endif()