Skip to content

Commit

Permalink
Reapply [test-suite] Add HIP Tests to External
Browse files Browse the repository at this point in the history
Adding simple HIP Tests, with a simple saxpy.cpp example.
Shared functions and macros added to GPUTestVariant.cmake
in cmake/modules. Common functions shared between HIP and
CUDA.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D99997
  • Loading branch information
aaronenyeshi committed Apr 13, 2021
1 parent 42674b5 commit d590d0b
Show file tree
Hide file tree
Showing 10 changed files with 309 additions and 71 deletions.
1 change: 1 addition & 0 deletions External/CMakeLists.txt
@@ -1,4 +1,5 @@
add_subdirectory(CUDA)
add_subdirectory(HIP)
add_subdirectory(HMMER)
add_subdirectory(Nurbs)
add_subdirectory(Povray)
Expand Down
96 changes: 25 additions & 71 deletions External/CUDA/CMakeLists.txt
@@ -1,4 +1,5 @@
include(External)
include(GPUTestVariant)
llvm_externals_find(TEST_SUITE_CUDA_ROOT "cuda" "CUDA prerequisites")

set(SUPPORTED_GPU_CUDA_7_0
Expand Down Expand Up @@ -30,83 +31,36 @@ set(SUPPORTED_GPU_CUDA_10_2 ${SUPPORTED_GPU_CUDA_10_1})
set(SUPPORTED_GPU_CUDA_11_0 ${SUPPORTED_GPU_CUDA_10_2}
sm_80)

# Helper macro to extract version number at the end of the string
# Input: get_version(Var String)
# Where String = /some/string/with/version-x.y.z
# Output:
# Sets Var=x.y.z
macro(get_version Var Path)
string(REGEX MATCH "[0-9]+(\\.[0-9]+)*$" ${Var} ${Path})
endmacro (get_version)

# Helper function to glob CUDA source files and set LANGUAGE property
# to CXX on each of them. Sets Var in parent scope to the list of
# files found.
macro(cuda_glob Var)
file(GLOB FileList ${ARGN})
foreach(File IN LISTS FileList)
if(${File} MATCHES ".*\.cu$")
set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
endif()
endforeach()
set(${Var} ${FileList})
endmacro(cuda_glob)

macro(create_one_local_test_f Name FileGlob FilterRegex)
if (${VariantSuffix} MATCHES ${FilterRegex})
cuda_glob(_sources ${FileGlob})
set(_executable ${Name}-${VariantSuffix})
set(_executable_path ${CMAKE_CURRENT_BINARY_DIR}/${_executable})
llvm_test_run()
set(REFERENCE_OUTPUT)
# Verify reference output if it exists.
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${Name}.reference_output)
set(REFERENCE_OUTPUT ${Name}.reference_output)
llvm_test_verify(WORKDIR ${CMAKE_CURRENT_BINARY_DIR}
${FPCMP} %o ${REFERENCE_OUTPUT}-${VariantSuffix}
)
llvm_test_executable(${_executable} ${_sources})
llvm_test_data(${_executable}
DEST_SUFFIX "-${VariantSuffix}"
${REFERENCE_OUTPUT})
else()
llvm_test_executable(${_executable} ${_sources})
endif()
target_compile_options(${_executable} PUBLIC ${VariantCPPFLAGS})
if(VariantLibs)
target_link_libraries(${_executable} ${VariantLibs})
endif()
add_dependencies(cuda-tests-simple-${VariantSuffix} ${_executable})
# Local tests are presumed to be fast.
list(APPEND CUDA_SIMPLE_TEST_TARGETS ${_executable}.test)
endif()
endmacro()

macro(create_one_local_test Name FileGlob)
create_one_local_test_f(${Name} ${FileGlob} ".*")
endmacro()

# Create targets for CUDA tests that are part of the test suite.
macro(create_local_cuda_tests VariantSuffix)
create_one_local_test(assert assert.cu)
create_one_local_test(axpy axpy.cu)
create_one_local_test(algorithm algorithm.cu)
create_one_local_test(cmath cmath.cu)
create_one_local_test(complex complex.cu)
create_one_local_test(math_h math_h.cu)
create_one_local_test(new new.cu)
create_one_local_test(empty empty.cu)
create_one_local_test(printf printf.cu)
create_one_local_test(future future.cu)
create_one_local_test(builtin_var builtin_var.cu)
# We only need SIMD tests on CUDA-8.0 to verivy that our reference is correct
set(VariantOffload "cuda")
list(APPEND CUDA_LOCAL_TESTS assert)
list(APPEND CUDA_LOCAL_TESTS axpy)
list(APPEND CUDA_LOCAL_TESTS algorithm)
list(APPEND CUDA_LOCAL_TESTS cmath)
list(APPEND CUDA_LOCAL_TESTS complex)
list(APPEND CUDA_LOCAL_TESTS math_h)
list(APPEND CUDA_LOCAL_TESTS new)
list(APPEND CUDA_LOCAL_TESTS empty)
list(APPEND CUDA_LOCAL_TESTS printf)
list(APPEND CUDA_LOCAL_TESTS future)
list(APPEND CUDA_LOCAL_TESTS builtin_var)
list(APPEND CUDA_LOCAL_TESTS test_round)
foreach(_cuda_test IN LISTS CUDA_LOCAL_TESTS)
create_one_local_test(${_cuda_test} ${_cuda_test}.cu
${VariantOffload} ${VariantSuffix}
"${VariantCPPFLAGS}" "${VariantLibs}")
endforeach()

# We only need SIMD tests on CUDA-8.0 to verify that our reference is correct
# and matches NVIDIA-provided one. and on CUDA-9.2 to verify that clang's
# implementation matches the reference. This test also happens to be the
# longest one, so by not running unnecessary instances we speed up cuda
# buildbot a lot.
create_one_local_test_f(simd simd.cu
"cuda-(8[.]0|9[.]2)-c[+][+]11-libc[+][+]")
create_one_local_test(test_round test_round.cu)
"cuda-(8[.]0|9[.]2)-c[+][+]11-libc[+][+]"
${VariantOffload} ${VariantSuffix}
"${VariantCPPFLAGS}" "${VariantLibs}")
endmacro()

macro(thrust_make_test_name TestName TestSourcePath)
Expand Down Expand Up @@ -196,7 +150,7 @@ function(create_cuda_test_variant Std VariantSuffix)
# Target for CUDA tests that take little time to build and run.
add_custom_target(check-cuda-simple-${VariantSuffix}
COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS}
${CUDA_SIMPLE_TEST_TARGETS}
${VARIANT_SIMPLE_TEST_TARGETS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS cuda-tests-simple-${VariantSuffix}
USES_TERMINAL)
Expand Down
98 changes: 98 additions & 0 deletions External/HIP/CMakeLists.txt
@@ -0,0 +1,98 @@
include(External)
include(GPUTestVariant)
llvm_externals_find(TEST_SUITE_HIP_ROOT "hip" "HIP prerequisites")

# Create targets for HIP tests that are part of the test suite.
macro(create_local_hip_tests VariantSuffix)
set(VariantOffload "hip")
# Add HIP tests to be added to hip-tests-simple
list(APPEND HIP_LOCAL_TESTS empty)
list(APPEND HIP_LOCAL_TESTS saxpy)
foreach(_hip_test IN LISTS HIP_LOCAL_TESTS)
create_one_local_test(${_hip_test} ${_hip_test}.hip
${VariantOffload} ${VariantSuffix}
"${VariantCPPFLAGS}" "${VariantLibs}")
endforeach()
endmacro()

function(create_hip_test VariantSuffix)
message(STATUS "Creating HIP test variant ${VariantSuffix}")
add_custom_target(hip-tests-simple-${VariantSuffix}
COMMENT "Build HIP test variant ${VariantSuffix}")

set(VariantCPPFLAGS ${_HIP_CPPFLAGS})
set(VariantLibs ${_HIP_Libs})
list(APPEND LDFLAGS ${_HIP_LDFLAGS})

create_local_hip_tests(${VariantSuffix})
add_dependencies(hip-tests-simple hip-tests-simple-${VariantSuffix})

add_custom_target(check-hip-simple-${VariantSuffix}
COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS}
${VARIANT_SIMPLE_TEST_TARGETS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS hip-tests-simple-${VariantSuffix}
USES_TERMINAL)
add_dependencies(check-hip-simple check-hip-simple-${VariantSuffix})
endfunction(create_hip_test)

macro(create_hip_tests)
# Find all rocm installations at Externals/hip/ directory.
# For ROCm, the path looks like rocm-4.1.0
message(STATUS "Checking HIP prerequisites in ${TEST_SUITE_HIP_ROOT}")
file(GLOB RocmVersions ${TEST_SUITE_HIP_ROOT}/rocm-*)
list(SORT RocmVersions)
foreach(RocmDir IN LISTS RocmVersions)
get_version(RocmVersion ${RocmDir})
message(STATUS "Found ROCm ${RocmVersion}")
list(APPEND ROCM_PATHS ${RocmDir})
add_library(amdhip64-${RocmVersion} SHARED IMPORTED)
set_property(TARGET amdhip64-${RocmVersion} PROPERTY IMPORTED_LOCATION
${RocmDir}/lib/libamdhip64.so)
endforeach(RocmDir)

if(NOT ROCM_PATHS)
message(SEND_ERROR
"There are no ROCm installations in ${TEST_SUITE_HIP_ROOT}")
return()
endif()

add_custom_target(hip-tests-simple
COMMENT "Build all simple HIP tests")
add_custom_target(check-hip-simple
COMMENT "Run all simple HIP tests")

if(NOT AMDGPU_ARCHS)
list(APPEND AMDGPU_ARCHS "gfx906;gfx908")
endif()

foreach(_RocmPath ${ROCM_PATHS})
get_version(_RocmVersion ${_RocmPath})
set(_HIP_Suffix "hip-${_RocmVersion}")
# Set up HIP test flags
set(_HIP_CPPFLAGS -xhip --hip-device-lib-path=${_RocmPath}/amdgcn/bitcode
-I${_RocmPath}/include)
set(_HIP_LDFLAGS -L${_RocmPath}/lib -lamdhip64)
set(_HIP_Libs amdhip64-${RocmVersion})

# Unset these for each iteration of rocm path.
set(_ArchFlags)
set(_ArchList)
foreach(_AMDGPUArch IN LISTS AMDGPU_ARCHS)
list(APPEND _ArchFlags --offload-arch=${_AMDGPUArch})
endforeach()
message(STATUS "Building ${_RocmPath} targets for ${AMDGPU_ARCHS}")
list(APPEND _HIP_CPPFLAGS ${_ArchFlags})

create_hip_test(${_HIP_Suffix})
endforeach()

add_custom_target(hip-tests-all DEPENDS hip-tests-simple
COMMENT "Build all HIP tests.")

file(COPY lit.local.cfg DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
endmacro(create_hip_tests)

if(TEST_SUITE_HIP_ROOT)
create_hip_tests()
endif()
30 changes: 30 additions & 0 deletions External/HIP/README
@@ -0,0 +1,30 @@
HIP Tests
==========

HIP tests are enabled if cmake is invoked with
-DTEST_SUITE_EXTERNALS_DIR=<externals path> and specified externals
directory contains at least one ROCm installation.

Expected externals directory structure:
Externals/
hip/
rocm-X.Y.Z/ -- One or more ROCm installation.

export EXTERNAL_DIR=/your/Externals/path
export AMDGPU_ARCHS=gfx906;gfx908 # List of AMDGPU archs to compile
export CLANG_DIR=/your/clang/build/dir
export TEST_SUITE_DIR=/path/to/test-suite-sources

Configure, build and run tests:

```
$ mkdir test-suite-build-dir
$ cd test-suite-build-dir
$ PATH=$CLANG_DIR/bin:$PATH CXX=clang++ CC=clang cmake -G Ninja -DTEST_SUITE_EXTERNALS_DIR=$EXTERNAL_DIR -DAMDGPU_ARCHS=$AMDGPU_ARCHS -DCMAKE_CXX_COMPILER="$CLANG_DIR/bin/clang++" -DCMAKE_C_COMPILER="$CLANG_DIR/bin/clang" $TEST_SUITE_DIR
$ ninja hip-tests-simple
$ ninja check-hip-simple
```

This will build every test for each of the installed ROCm in the
$EXTERNAL_DIR/hip location, and run them against the expected
reference_output.
1 change: 1 addition & 0 deletions External/HIP/empty.hip
@@ -0,0 +1 @@
int main(int argc, char **argv) { return 0; }
1 change: 1 addition & 0 deletions External/HIP/empty.reference_output
@@ -0,0 +1 @@
exit 0
14 changes: 14 additions & 0 deletions External/HIP/lit.local.cfg
@@ -0,0 +1,14 @@
# -*- python -*-

import os

hip_env_vars = [
'HIP_VISIBLE_DEVICES',
'LD_LIBRARY_PATH',
]

for var in hip_env_vars:
if var in os.environ:
config.environment[var] = os.environ[var]

config.traditional_output = True
62 changes: 62 additions & 0 deletions External/HIP/saxpy.hip
@@ -0,0 +1,62 @@
#include <iostream>

#include <hip/hip_runtime.h>

#define N (1024 * 500)

__global__ void saxpy(float a, float* x, float* y) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < N) y[tid] = a * x[tid] + y[tid];
}

int main() {

const float a = 100.0f;
float* x = (float*)malloc(N * sizeof(float));
float* y = (float*)malloc(N * sizeof(float));

// Initialize the input data.
for (size_t i = 0; i < N; ++i) {
x[i] = static_cast<float>(i);
y[i] = static_cast<float>(i * 2);
}

// Make a copy for the GPU implementation.
float* d_x;
float* d_y;
hipMalloc((void**)&d_x, N * sizeof(float));
hipMalloc((void**)&d_y, N * sizeof(float));
hipMemcpy(d_x, x, N * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(d_y, y, N * sizeof(float), hipMemcpyHostToDevice);

// CPU implementation of saxpy.
for (int i = 0; i < N; i++) {
y[i] = a * x[i] + y[i];
}

// Launch a GPU kernel to compute the saxpy.
saxpy<<<(N+255)/256, 256>>>(a, d_x, d_y);

// Copy the device results to host.
float* h_y = (float*)malloc(N * sizeof(float));
hipDeviceSynchronize();
hipMemcpy(h_y, d_y, N * sizeof(float), hipMemcpyDeviceToHost);

// Verify the results match CPU.
int errors = 0;
for (int i = 0; i < N; i++) {
if (fabs(y[i] - h_y[i]) > fabs(y[i] * 0.0001f))
errors++;
}
if (errors != 0)
std::cout << errors << " errors" << std::endl;
else
std::cout << "PASSED!" << std::endl;

free(h_y);
free(x);
free(y);
hipFree(d_x);
hipFree(d_y);
return errors;
}
2 changes: 2 additions & 0 deletions External/HIP/saxpy.reference_output
@@ -0,0 +1,2 @@
PASSED!
exit 0

0 comments on commit d590d0b

Please sign in to comment.