diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h index 29e7e5bb347d6..1ea129a97a17e 100644 --- a/openmp/libomptarget/DeviceRTL/include/Debug.h +++ b/openmp/libomptarget/DeviceRTL/include/Debug.h @@ -31,6 +31,9 @@ void __assert_fail(const char *expr, const char *msg, const char *file, else \ __assert_assume(expr); \ } +#define UNREACHABLE(msg) \ + PRINT(msg); \ + __builtin_trap(); ///} diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h index 96314e4702f6d..165904644dbb9 100644 --- a/openmp/libomptarget/DeviceRTL/include/Mapping.h +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -18,6 +18,12 @@ namespace ompx { namespace mapping { +enum { + DIM_X = 0, + DIM_Y = 1, + DIM_Z = 2, +}; + #pragma omp begin declare target device_type(nohost) inline constexpr uint32_t MaxThreadsPerTeam = 1024; @@ -63,34 +69,38 @@ LaneMaskTy lanemaskGT(); /// Return the thread Id in the warp, in [0, getWarpSize()). uint32_t getThreadIdInWarp(); -/// Return the thread Id in the block, in [0, getBlockSize()). -uint32_t getThreadIdInBlock(); - -/// Return the warp id in the block. -uint32_t getWarpId(); - /// Return the warp size, thus number of threads in the warp. uint32_t getWarpSize(); +/// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()] +uint32_t getWarpIdInBlock(); + /// Return the number of warps in the block. uint32_t getNumberOfWarpsInBlock(); -/// Return the block Id in the kernel, in [0, getKernelSize()). -uint32_t getBlockId(); +/// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)). +uint32_t getThreadIdInBlock(int32_t Dim = DIM_X); /// Return the block size, thus number of threads in the block. -/// -/// Note: The version taking \p IsSPMD mode explicitly can be used during the -/// initialization of the target region, that is before `mapping::isSPMDMode()` -/// can be called by any thread other than the main one. -uint32_t getBlockSize(); -uint32_t getBlockSize(bool IsSPMD); +uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X); + +/// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)). +uint32_t getBlockIdInKernel(int32_t Dim = DIM_X); /// Return the number of blocks in the kernel. -uint32_t getNumberOfBlocks(); +uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X); /// Return the kernel size, thus number of threads in the kernel. -uint32_t getKernelSize(); +uint32_t getNumberOfThreadsInKernel(); + +/// Return the maximal number of threads in the block usable for a team (= +/// parallel region). +/// +/// Note: The version taking \p IsSPMD mode explicitly can be used during the +/// initialization of the target region, that is before `mapping::isSPMDMode()` +/// can be called by any thread other than the main one. +uint32_t getMaxTeamThreads(); +uint32_t getMaxTeamThreads(bool IsSPMD); /// Return the number of processing elements on the device. uint32_t getNumberOfProcessorElements(); diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp index 873dccb929ea6..6e296b0a277ca 100644 --- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -40,7 +40,8 @@ void __assert_fail(const char *expr, const char *msg, const char *file, DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line, const char *Function) { if (config::isDebugMode(config::DebugKind::FunctionTracing) && - mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) { + mapping::getThreadIdInBlock() == 0 && + mapping::getBlockIdInKernel() == 0) { uint16_t &Level = state::getKernelEnvironment().DynamicEnv->DebugIndentionLevel; @@ -56,7 +57,8 @@ DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line, DebugEntryRAII::~DebugEntryRAII() { if (config::isDebugMode(config::DebugKind::FunctionTracing) && - mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) { + mapping::getThreadIdInBlock() == 0 && + mapping::getBlockIdInKernel() == 0) { uint16_t &Level = state::getKernelEnvironment().DynamicEnv->DebugIndentionLevel; Level--; diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp index bc1d8d5b3d33f..e8f6cfdc6ea54 100644 --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -107,10 +107,11 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment) { // reaches its corresponding synchronize::threads call: that would permit all // active worker threads to proceed before the main thread has actually set // state::ParallelRegionFn, and then they would immediately quit without - // doing any work. mapping::getBlockSize() does not include any of the main - // thread's warp, so none of its threads can ever be active worker threads. + // doing any work. mapping::getMaxTeamThreads() does not include any of the + // main thread's warp, so none of its threads can ever be active worker + // threads. if (UseGenericStateMachine && - mapping::getThreadIdInBlock() < mapping::getBlockSize(IsSPMD)) { + mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD)) { genericStateMachine(KernelEnvironment.Ident); } else { // Retrieve the work function just to ensure we always call diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp index 78361284ff8de..8f26af086e714 100644 --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -26,16 +26,16 @@ namespace impl { // Forward declarations defined to be defined for AMDGCN and NVPTX. const llvm::omp::GV &getGridValue(); -uint32_t getNumHardwareThreadsInBlock(); LaneMaskTy activemask(); LaneMaskTy lanemaskLT(); LaneMaskTy lanemaskGT(); uint32_t getThreadIdInWarp(); -uint32_t getThreadIdInBlock(); -uint32_t getKernelSize(); -uint32_t getBlockId(); -uint32_t getNumberOfBlocks(); -uint32_t getWarpId(); +uint32_t getThreadIdInBlock(int32_t Dim); +uint32_t getNumberOfThreadsInBlock(int32_t Dim); +uint32_t getNumberOfThreadsInKernel(); +uint32_t getBlockIdInKernel(int32_t Dim); +uint32_t getNumberOfBlocksInKernel(int32_t Dim); +uint32_t getWarpIdInBlock(); uint32_t getNumberOfWarpsInBlock(); /// AMDGCN Implementation @@ -47,8 +47,16 @@ const llvm::omp::GV &getGridValue() { return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>(); } -uint32_t getNumHardwareThreadsInBlock() { - return __builtin_amdgcn_workgroup_size_x(); +uint32_t getNumberOfThreadsInBlock(int32_t Dim) { + switch (Dim) { + case 0: + return __builtin_amdgcn_workgroup_size_x(); + case 1: + return __builtin_amdgcn_workgroup_size_y(); + case 2: + return __builtin_amdgcn_workgroup_size_z(); + }; + UNREACHABLE("Dim outside range!"); } LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } @@ -73,22 +81,53 @@ uint32_t getThreadIdInWarp() { return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); } -uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } +uint32_t getThreadIdInBlock(int32_t Dim) { + switch (Dim) { + case 0: + return __builtin_amdgcn_workitem_id_x(); + case 1: + return __builtin_amdgcn_workitem_id_y(); + case 2: + return __builtin_amdgcn_workitem_id_z(); + }; + UNREACHABLE("Dim outside range!"); +} -uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); } +uint32_t getNumberOfThreadsInKernel() { + return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() * + __builtin_amdgcn_grid_size_z(); +} -uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); } +uint32_t getBlockIdInKernel(int32_t Dim) { + switch (Dim) { + case 0: + return __builtin_amdgcn_workgroup_id_x(); + case 1: + return __builtin_amdgcn_workgroup_id_y(); + case 2: + return __builtin_amdgcn_workgroup_id_z(); + }; + UNREACHABLE("Dim outside range!"); +} -uint32_t getNumberOfBlocks() { - return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); +uint32_t getNumberOfBlocksInKernel(int32_t Dim) { + switch (Dim) { + case 0: + return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); + case 1: + return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); + case 2: + return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); + }; + UNREACHABLE("Dim outside range!"); } -uint32_t getWarpId() { - return impl::getThreadIdInBlock() / mapping::getWarpSize(); +uint32_t getWarpIdInBlock() { + return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); } uint32_t getNumberOfWarpsInBlock() { - return mapping::getBlockSize() / mapping::getWarpSize(); + return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize(); } #pragma omp end declare variant @@ -101,8 +140,16 @@ uint32_t getNumberOfWarpsInBlock() { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -uint32_t getNumHardwareThreadsInBlock() { - return __nvvm_read_ptx_sreg_ntid_x(); +uint32_t getNumberOfThreadsInBlock(int32_t Dim) { + switch (Dim) { + case 0: + return __nvvm_read_ptx_sreg_ntid_x(); + case 1: + return __nvvm_read_ptx_sreg_ntid_y(); + case 2: + return __nvvm_read_ptx_sreg_ntid_z(); + }; + UNREACHABLE("Dim outside range!"); } const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; } @@ -125,27 +172,62 @@ LaneMaskTy lanemaskGT() { return Res; } -uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } +uint32_t getThreadIdInBlock(int32_t Dim) { + switch (Dim) { + case 0: + return __nvvm_read_ptx_sreg_tid_x(); + case 1: + return __nvvm_read_ptx_sreg_tid_y(); + case 2: + return __nvvm_read_ptx_sreg_tid_z(); + }; + UNREACHABLE("Dim outside range!"); +} uint32_t getThreadIdInWarp() { - return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1); + return impl::getThreadIdInBlock(mapping::DIM_X) & + (mapping::getWarpSize() - 1); } -uint32_t getKernelSize() { - return __nvvm_read_ptx_sreg_nctaid_x() * - mapping::getNumberOfProcessorElements(); +uint32_t getBlockIdInKernel(int32_t Dim) { + switch (Dim) { + case 0: + return __nvvm_read_ptx_sreg_ctaid_x(); + case 1: + return __nvvm_read_ptx_sreg_ctaid_y(); + case 2: + return __nvvm_read_ptx_sreg_ctaid_z(); + }; + UNREACHABLE("Dim outside range!"); } -uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } +uint32_t getNumberOfBlocksInKernel(int32_t Dim) { + switch (Dim) { + case 0: + return __nvvm_read_ptx_sreg_nctaid_x(); + case 1: + return __nvvm_read_ptx_sreg_nctaid_y(); + case 2: + return __nvvm_read_ptx_sreg_nctaid_z(); + }; + UNREACHABLE("Dim outside range!"); +} -uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } +uint32_t getNumberOfThreadsInKernel() { + return impl::getNumberOfThreadsInBlock(0) * + impl::getNumberOfBlocksInKernel(0) * + impl::getNumberOfThreadsInBlock(1) * + impl::getNumberOfBlocksInKernel(1) * + impl::getNumberOfThreadsInBlock(2) * + impl::getNumberOfBlocksInKernel(2); +} -uint32_t getWarpId() { - return impl::getThreadIdInBlock() / mapping::getWarpSize(); +uint32_t getWarpIdInBlock() { + return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); } uint32_t getNumberOfWarpsInBlock() { - return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / mapping::getWarpSize(); } @@ -162,7 +244,7 @@ uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; } ///{ static bool isInLastWarp() { - uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) & + uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & ~(mapping::getWarpSize() - 1); return mapping::getThreadIdInBlock() == MainTId; } @@ -203,53 +285,55 @@ uint32_t mapping::getThreadIdInWarp() { return ThreadIdInWarp; } -uint32_t mapping::getThreadIdInBlock() { - uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(); +uint32_t mapping::getThreadIdInBlock(int32_t Dim) { + uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim); return ThreadIdInBlock; } uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } -uint32_t mapping::getBlockSize(bool IsSPMD) { - uint32_t BlockSize = - mapping::getNumberOfProcessorElements() - (!IsSPMD * impl::getWarpSize()); - return BlockSize; +uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { + uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); + // If we are in SPMD mode, remove one warp. + return BlockSize - (!IsSPMD * impl::getWarpSize()); } -uint32_t mapping::getBlockSize() { - return mapping::getBlockSize(mapping::isSPMDMode()); +uint32_t mapping::getMaxTeamThreads() { + return mapping::getMaxTeamThreads(mapping::isSPMDMode()); } -uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } +uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { + return impl::getNumberOfThreadsInBlock(Dim); +} + +uint32_t mapping::getNumberOfThreadsInKernel() { + return impl::getNumberOfThreadsInKernel(); +} -uint32_t mapping::getWarpId() { - uint32_t WarpID = impl::getWarpId(); +uint32_t mapping::getWarpIdInBlock() { + uint32_t WarpID = impl::getWarpIdInBlock(); ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr); return WarpID; } -uint32_t mapping::getBlockId() { - uint32_t BlockId = impl::getBlockId(); - ASSERT(BlockId < impl::getNumberOfBlocks(), nullptr); +uint32_t mapping::getBlockIdInKernel(int32_t Dim) { + uint32_t BlockId = impl::getBlockIdInKernel(Dim); + ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr); return BlockId; } uint32_t mapping::getNumberOfWarpsInBlock() { uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); - ASSERT(impl::getWarpId() < NumberOfWarpsInBlocks, nullptr); + ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr); return NumberOfWarpsInBlocks; } -uint32_t mapping::getNumberOfBlocks() { - uint32_t NumberOfBlocks = impl::getNumberOfBlocks(); - ASSERT(impl::getBlockId() < NumberOfBlocks, nullptr); +uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { + uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim); + ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr); return NumberOfBlocks; } -uint32_t mapping::getNumberOfProcessorElements() { - uint32_t NumberOfProcessorElements = impl::getNumHardwareThreadsInBlock(); - ASSERT(impl::getThreadIdInBlock() < NumberOfProcessorElements, nullptr); - return NumberOfProcessorElements; -} +uint32_t mapping::getNumberOfProcessorElements() { __builtin_trap(); } ///} @@ -279,7 +363,7 @@ __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { FunctionTracingRAII(); - return impl::getNumHardwareThreadsInBlock(); + return impl::getNumberOfThreadsInBlock(mapping::DIM_X); } __attribute__((noinline)) uint32_t __kmpc_get_warp_size() { @@ -287,4 +371,5 @@ __attribute__((noinline)) uint32_t __kmpc_get_warp_size() { return impl::getWarpSize(); } } + #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp index 82b944a8bd0a2..84d8f88105f10 100644 --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -49,7 +49,7 @@ namespace { uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { uint32_t NThreadsICV = NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; - uint32_t NumThreads = mapping::getBlockSize(); + uint32_t NumThreads = mapping::getMaxTeamThreads(); if (NThreadsICV != 0 && NThreadsICV < NumThreads) NumThreads = NThreadsICV; @@ -110,8 +110,8 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, ASSERT(state::HasThreadState == false, nullptr); uint32_t NumThreads = determineNumberOfThreads(num_threads); - uint32_t BlockSize = mapping::getBlockSize(); - uint32_t PTeamSize = NumThreads == BlockSize ? 0 : NumThreads; + uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); + uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; if (mapping::isSPMDMode()) { // Avoid the race between the read of the `icv::Level` above and the write // below by synchronizing all threads here. diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp index f544928e33c40..8e0b91bfc1eb1 100644 --- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -91,7 +91,7 @@ static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars, #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 uint32_t WarpsNeeded = (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); - uint32_t WarpId = mapping::getWarpId(); + uint32_t WarpId = mapping::getWarpIdInBlock(); // Volta execution model: // For the Generic execution mode a parallel region either has 1 thread and diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index f824ea2809c21..c18368da72d6c 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -93,7 +93,7 @@ struct SharedMemorySmartStackTy { private: /// Compute the size of the storage space reserved for a thread. uint32_t computeThreadStorageTotal() { - uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements(); + uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock), Alignment); } @@ -269,7 +269,7 @@ void state::enterDataEnvironment(IdentTy *Ident) { static_cast(__kmpc_alloc_shared(sizeof(ThreadStateTy))); uintptr_t *ThreadStatesBitsPtr = reinterpret_cast(&ThreadStates); if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) { - uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getBlockSize(); + uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getMaxTeamThreads(); void *ThreadStatesPtr = memory::allocGlobal(Bytes, "Thread state array allocation"); if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0), @@ -322,7 +322,7 @@ void state::assumeInitialState(bool IsSPMD) { int state::getEffectivePTeamSize() { int PTeamSize = state::ParallelTeamSize; - return PTeamSize ? PTeamSize : mapping::getBlockSize(); + return PTeamSize ? PTeamSize : mapping::getMaxTeamThreads(); } extern "C" { @@ -334,7 +334,7 @@ void omp_set_num_threads(int V) { icv::NThreads = V; } int omp_get_max_threads(void) { int NT = icv::NThreads; - return NT > 0 ? NT : mapping::getBlockSize(); + return NT > 0 ? NT : mapping::getMaxTeamThreads(); } int omp_get_level(void) { @@ -373,7 +373,7 @@ int omp_get_num_threads(void) { return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); } -int omp_get_thread_limit(void) { return mapping::getBlockSize(); } +int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } @@ -415,9 +415,9 @@ int omp_get_num_devices(void) { return config::getNumDevices(); } int omp_get_device_num(void) { return config::getDeviceNum(); } -int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); } +int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } -int omp_get_team_num() { return mapping::getBlockId(); } +int omp_get_team_num() { return mapping::getBlockIdInKernel(); } int omp_get_initial_device(void) { return -1; } } diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp index 36536b7a81a16..cb60aab25babf 100644 --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -385,7 +385,7 @@ void setLock(omp_lock_t *Lock) { for (;;) { now = __nvvm_read_ptx_sreg_clock(); int32_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= OMP_SPIN * mapping::getBlockId()) { + if (cycles >= OMP_SPIN * mapping::getBlockIdInKernel()) { break; } }