Skip to content

Commit

Permalink
Initial implementation of gfx942 (kokkos#6358)
Browse files Browse the repository at this point in the history
* Initial implementation of gfx942

Change-Id: Id31ca3ba5356d021cade2abc3e3f51f9f3b4d211

* remove VEGA arch

Change-Id: I1454bb0b91518bfcf7a04506e40b98387cdf8ed9

* apply formatting

Change-Id: Id9c03fe451d1d28a3c23a77f161a2600f016c7e4

* Fix conditional

Co-authored-by: Daniel Arndt <arndtd@ornl.gov>

* More cmake fixes

Co-authored-by: Damien L-G <dalg24+github@gmail.com>

* remove unneeded for old naming schema

Change-Id: Ibd028fddeedf8e0fdda50b72625ab62cee6fa71e

---------

Co-authored-by: Nicholas Curtis <nicurtis@amd.com>
Co-authored-by: Daniel Arndt <arndtd@ornl.gov>
Co-authored-by: Damien L-G <dalg24+github@gmail.com>
  • Loading branch information
4 people committed Aug 16, 2023
1 parent 8d8b24a commit 04d5c55
Show file tree
Hide file tree
Showing 8 changed files with 20 additions and 4 deletions.
7 changes: 6 additions & 1 deletion Makefile.kokkos
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ KOKKOS_DEVICES ?= "Threads"
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX1030, GFX1100
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100
# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
KOKKOS_ARCH ?= ""
Expand Down Expand Up @@ -1091,6 +1091,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
Expand Down
1 change: 1 addition & 0 deletions cmake/KokkosCore_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@
#cmakedefine KOKKOS_ARCH_AMD_GFX906
#cmakedefine KOKKOS_ARCH_AMD_GFX908
#cmakedefine KOKKOS_ARCH_AMD_GFX90A
#cmakedefine KOKKOS_ARCH_AMD_GFX942
#cmakedefine KOKKOS_ARCH_AMD_GFX1030
#cmakedefine KOKKOS_ARCH_AMD_GFX1100
#cmakedefine KOKKOS_ARCH_AMD_GPU
Expand Down
3 changes: 3 additions & 0 deletions cmake/kokkos_arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR K
ENDIF()

# AMD archs ordered in decreasing priority of autodetection
LIST(APPEND SUPPORTED_AMD_GPUS MI300)
LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942)
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942)
LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100)
LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908)
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908)
Expand Down
2 changes: 1 addition & 1 deletion core/src/HIP/Kokkos_HIP_Instance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace Impl {

struct HIPTraits {
#if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
defined(KOKKOS_ARCH_AMD_GFX90A)
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942)
static int constexpr WarpSize = 64;
static int constexpr WarpIndexMask = 0x003f; /* hexadecimal for 63 */
static int constexpr WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/
Expand Down
7 changes: 6 additions & 1 deletion core/src/HIP/Kokkos_HIP_ReduceScan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ struct HIPReductionsFunctor<FunctorType, true> {
}
scalar_intra_warp_reduction(functor, value, false, warp_size,
*my_global_team_buffer_element);
__threadfence();
}
}

Expand Down Expand Up @@ -182,7 +183,10 @@ struct HIPReductionsFunctor<FunctorType, false> {
scalar_intra_warp_reduction(
functor, my_shared_team_buffer_element, false,
blockDim.x * blockDim.y / HIPTraits::WarpSize);
if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
if (threadIdx.x + threadIdx.y == 0) {
*result = *shared_team_buffer_element;
if (skip) __threadfence();
}
}
}

Expand Down Expand Up @@ -382,6 +386,7 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
for (size_t i = threadIdx.y; i < word_count.value; i += blockDim.y) {
global[i] = shared[i];
}
__threadfence();
}

// Contributing blocks note that their contribution has been completed via an
Expand Down
1 change: 1 addition & 0 deletions core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction(
pointer_type global =
reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x;
*global = value;
__threadfence();
}

// One warp of last block performs inter block reduction through loading the
Expand Down
2 changes: 1 addition & 1 deletion core/unit_test/TestMathematicalFunctions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,7 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
// FIXME_OPENMPTARGET FIXME_AMD
#if defined(KOKKOS_ENABLE_OPENMPTARGET) && \
(defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
defined(KOKKOS_ARCH_AMD_GFX90A))
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942))

TEST_MATH_FUNCTION(log2)({1, 23, 456, 7890});
#endif
Expand Down
1 change: 1 addition & 0 deletions generate_makefile.bash
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ display_help_text() {
echo " AMD_GFX906 = AMD GPU MI50/MI60 GFX906"
echo " AMD_GFX908 = AMD GPU MI100 GFX908"
echo " AMD_GFX90A = AMD GPU MI200 GFX90A"
echo " AMD_GFX942 = AMD GPU MI300 GFX942"
echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030"
echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100"
echo " [ARM]"
Expand Down

0 comments on commit 04d5c55

Please sign in to comment.