Skip to content

Commit

Permalink
[Libomptarget] Remove remaining inline assembly from the device RTL (#79922)
Browse files Browse the repository at this point in the history

Summary:
Recent patches have added some missing intrinsic functions for NVPTX. This
patch gets rid of all the remaining uses of inline assembly. The one
change that was not a direct replacement with a built-in was to the `pack`
and `unpack` implementations. However, the generic C implementation
produces equivalent SASS when run through PTXAS.
  • Loading branch information
jhuber6 committed Jan 30, 2024
1 parent 77e5136 commit 6aed6cc
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 61 deletions.
2 changes: 1 addition & 1 deletion openmp/libomptarget/DeviceRTL/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
else()
libomptarget_error_say("Unknown GPU architecture '${gpu_arch}'")
endif()
Expand Down
18 changes: 3 additions & 15 deletions openmp/libomptarget/DeviceRTL/src/Mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,23 +154,11 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {

/// Hands back a reference to the llvm::omp::NVPTXGridValues table.
const llvm::omp::GV &getGridValue() {
  return llvm::omp::NVPTXGridValues;
}

LaneMaskTy activemask() {
unsigned int Mask;
asm("activemask.b32 %0;" : "=r"(Mask));
return Mask;
}
/// Returns whatever the __nvvm_activemask() builtin reports, converted to
/// LaneMaskTy.
LaneMaskTy activemask() {
  return __nvvm_activemask();
}

LaneMaskTy lanemaskLT() {
__kmpc_impl_lanemask_t Res;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
return Res;
}
/// Returns the value of the `lanemask_lt` special register, read through the
/// __nvvm_read_ptx_sreg_lanemask_lt() builtin.
LaneMaskTy lanemaskLT() {
  return __nvvm_read_ptx_sreg_lanemask_lt();
}

LaneMaskTy lanemaskGT() {
__kmpc_impl_lanemask_t Res;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res));
return Res;
}
/// Returns the value of the `lanemask_gt` special register, read through the
/// __nvvm_read_ptx_sreg_lanemask_gt() builtin.
LaneMaskTy lanemaskGT() {
  return __nvvm_read_ptx_sreg_lanemask_gt();
}

uint32_t getThreadIdInBlock(int32_t Dim) {
switch (Dim) {
Expand Down
5 changes: 2 additions & 3 deletions openmp/libomptarget/DeviceRTL/src/Misc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,8 @@ double getWTick() {
}

double getWTime() {
unsigned long long nsecs;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
return (double)nsecs * getWTick();
uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
return static_cast<double>(nsecs) * getWTick();
}

#pragma omp end declare variant
Expand Down
7 changes: 2 additions & 5 deletions openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,10 +345,7 @@ void namedBarrier() {
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
constexpr int BarrierNo = 7;
asm volatile("barrier.sync %0, %1;"
:
: "r"(BarrierNo), "r"(NumThreads)
: "memory");
__nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}

/// Issues __nvvm_membar_cta(). The ordering argument is unnamed and therefore
/// never consulted; the same builtin is invoked regardless of its value.
void fenceTeam(atomic::OrderingTy) {
  __nvvm_membar_cta();
}
Expand All @@ -361,7 +358,7 @@ void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }

void syncThreads(atomic::OrderingTy Ordering) {
constexpr int BarrierNo = 8;
asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
__nvvm_barrier_sync(BarrierNo);
}

/// Synchronizes by calling __syncthreads(). \p Ordering is accepted but not
/// read by this implementation.
void syncThreadsAligned(atomic::OrderingTy Ordering) {
  __syncthreads();
}
Expand Down
39 changes: 2 additions & 37 deletions openmp/libomptarget/DeviceRTL/src/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,52 +22,17 @@ using namespace ompx;
namespace impl {

/// Always answers `false`; \p Ptr is not inspected by this definition.
bool isSharedMemPtr(const void *Ptr) {
  return false;
}
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
uint64_t Pack(uint32_t LowBits, uint32_t HighBits);

/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
*LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
*HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
*HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
}

/// Combine two 32-bit halves into a single 64-bit value.
///
/// \param LowBits  becomes bits [0, 32) of the result.
/// \param HighBits becomes bits [32, 64) of the result.
/// \returns the 64-bit value (HighBits << 32) | LowBits.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  // Widen to 64 bits before shifting so the high half is not shifted out of a
  // 32-bit type.
  const uint64_t Upper = static_cast<uint64_t>(HighBits) << 32;
  const uint64_t Lower = static_cast<uint64_t>(LowBits);
  return Upper | Lower;
}

#pragma omp end declare variant
///}

/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})

void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
uint32_t LowBitsLocal, HighBitsLocal;
asm("mov.b64 {%0,%1}, %2;"
: "=r"(LowBitsLocal), "=r"(HighBitsLocal)
: "l"(Val));
*LowBits = LowBitsLocal;
*HighBits = HighBitsLocal;
}

uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
uint64_t Val;
asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
return Val;
}

#pragma omp end declare variant
///}

int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
Expand Down

0 comments on commit 6aed6cc

Please sign in to comment.