AMDGPU: weird miscompilations when using __ballot HIP function #62477

Epliz · 2023-05-01T09:33:29Z

Hi,

I am facing some weird mis-compilation for the following program when compiling for gfx90c/gfx908:

#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

#include <cstdint>

#if __AMDGCN_WAVEFRONT_SIZE == 32
typedef uint32_t hip_warp_ext_lanemask_t;
#elif __AMDGCN_WAVEFRONT_SIZE == 64
typedef uint64_t hip_warp_ext_lanemask_t;
#else
# error "Unsupported wavefront size"
#endif

// Active threads
static inline __device__ hip_warp_ext_lanemask_t __hip_warp_ext_activemask() {
  return __ballot(1);
}

// Warp vote functions
static inline __device__ int __hip_warp_ext_all(int predicate) {
  return __all(predicate);
}

static inline __device__ int __hip_warp_ext_any(int predicate) {
  return __any(predicate);
}

static inline __device__ hip_warp_ext_lanemask_t __hip_warp_ext_ballot(int predicate) {
  return __ballot(predicate);
}

static inline __device__ hip_warp_ext_lanemask_t __hip_warp_ext_match_any(int value) {
  bool active = true;
  hip_warp_ext_lanemask_t result = 0;

  while (active) {
    // determine what threads have the same value as the currently first active thread
    int first_active_value = __builtin_amdgcn_readfirstlane(value);
    int predicate = (value == first_active_value);
    hip_warp_ext_lanemask_t m = __hip_warp_ext_ballot(predicate); // THIS LINE IS PROBLEMATIC

    // if the current thread has the same value, set its result mask to the current one
    if (predicate) {
      result |= m;
      active = false;
    }
  }

  return result;
}


#include <cstdlib>

static void __assert_true(bool cond, const char* message, const char* file, int line) {
  if (!cond) {
    printf("Assertion failed in %s:%d with message %s\n", file, line, message);
    exit(-1);
  }
}

#define ASSERT_TRUE(cond, message) __assert_true((cond), (message), __FILE__, __LINE__);
#define ASSERT_FALSE(cond, message) __assert_true(!(cond), (message), __FILE__, __LINE__);
#define ASSERT_EQUAL(a, b, message) __assert_true(((a) == (b)), (message), __FILE__, __LINE__);
#define ASSERT_NOT_EQUAL(a, b, message) __assert_true(((a) != (b)), (message), __FILE__, __LINE__);

__global__
void test_warp_vote_functions(hip_warp_ext_lanemask_t* results) {
  results[2] = __hip_warp_ext_ballot(1);
  // will always see one bit set, can be any bit due to race condition
  results[5] = __hip_warp_ext_match_any(threadIdx.x);

  if (threadIdx.x < 16) {
    results[8] = __hip_warp_ext_activemask();
    results[9] = __hip_warp_ext_ballot(1);
  }
}

int main(int argc, char** argv) {
  
  hipSetDevice(0);

  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, 0);

  bool hasWarpVote = props.arch.hasWarpVote != 0;
  printf("Has warpVote: %d\n", props.arch.hasWarpVote);

  size_t num_results = 16;
  size_t results_size = sizeof(hip_warp_ext_lanemask_t) * num_results;

  hip_warp_ext_lanemask_t* results = new hip_warp_ext_lanemask_t[num_results];
  hip_warp_ext_lanemask_t* results_device;

  if (hipMalloc((void**) &results_device, results_size) != hipSuccess) {
    ASSERT_TRUE(false, "HIP allocation failed");
  }

  unsigned num_blocks = 1;
  unsigned threads_per_block = 64;

  dim3 block_dim(threads_per_block);
  dim3 grid_dim(num_blocks);

  test_warp_vote_functions<<<grid_dim, block_dim, 0, 0>>>(results_device);

  if (hipMemcpyDtoH(results, results_device, results_size) != hipSuccess) {
    ASSERT_TRUE(false, "HIP copy failed");
  }

  size_t expected_warp_size = 64;
  size_t full_warp_mask = expected_warp_size == 64 ? 0xFFFFFFFFFFFFFFFFull : 0xFFFFFFFFull;
  size_t half_warp_mask_a = expected_warp_size == 64 ? 0xAAAAAAAAAAAAAAAAull : 0xAAAAAAAAull;
  size_t half_warp_mask_b = expected_warp_size == 64 ? 0x5555555555555555ull : 0x55555555ull;

  if (hasWarpVote) {
    ASSERT_EQUAL(results[2], full_warp_mask, "ballot");

    ASSERT_TRUE(results[5] != 0 && (results[5] & results[5] - 1) == 0, "match_any_none"); // should have one bit set

    // partially active
    printf("mask: %lu\n", results[8]);
    ASSERT_EQUAL(results[8], 0xFFFFull, "partial activemask");
    printf("ballot: %lu\n", results[9]);
    ASSERT_EQUAL(results[9], 0xFFFFull, "partial ballot");
  }

  printf("All tests passed\n");
  return 0;
}

The assertion ASSERT_EQUAL(results[9], 0xFFFFull, "partial ballot"); gets triggered.
I should get the same value in results[8] and results[9] as it is basically the same expression being computed if we inline the different calls.

What is really weird is that if I change __hip_warp_ext_match_any to

static inline __device__ hip_warp_ext_lanemask_t __hip_warp_ext_match_any(int value) {
  bool active = true;
  hip_warp_ext_lanemask_t result = 0;

  while (active) {
    // determine what threads have the same value as the currently first active thread
    int first_active_value = __builtin_amdgcn_readfirstlane(value);
    int predicate = (value == first_active_value);
    hip_warp_ext_lanemask_t m = __ballot(predicate); // CHANGED THIS LINE TO DIRECTLY call __ballot

    // if the current thread has the same value, set its result mask to the current one
    if (predicate) {
      result |= m;
      active = false;
    }
  }

  return result;
}

then it works. However it doesn't inspire confidence to say the least, as it should give the exact same result.

Maybe the "ockl ballot hoisting hack" that I see in the generated assembly of the non-reduced case is at fault?

Using

hipcc --version
HIP version: 5.4.22802-aaa1e3d8
AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.4.1 22465 d6f0fe8b22e3d8ce0f2cbd657ea14b16043018a5)

or

hipcc --version
HIP version: 5.4.22804-474e8620
AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.4.3 23045 a29fe425c7b0e5aba97ed2f95f61fd5ecba68aed)

Best regards,
Epliz

The text was updated successfully, but these errors were encountered:

llvmbot · 2023-05-01T14:31:01Z

@llvm/issue-subscribers-backend-amdgpu

Epliz · 2023-05-01T17:50:13Z

Another perplexing bug (maybe the same?):

static inline __device__ hip_warp_ext_lanemask_t __hip_warp_ext_match_any(int value) {
  bool active = true;
  hip_warp_ext_lanemask_t result_mask = 0;

  while (active) {
    // determine what threads have the same value as the currently first active thread
    int first_active_value = __builtin_amdgcn_readfirstlane(value);
    int predicate = (value == first_active_value);
    hip_warp_ext_lanemask_t mask = __ballot(predicate);

    // if the current thread has the same value, set its result mask to the current one
    if (predicate) {
      result_mask |= mask;
      active = false;
    }
  }

  return result_mask;
}

__global__
void test_warp_vote_functions(size_t* results) {
  results[7] = __hip_warp_ext_match_any(threadIdx.x % 2); // can potentially see two values out

  if (threadIdx.x < 16) {
    results[14] = __hip_warp_ext_match_any(threadIdx.x % 2); // can potentially see two values out
  }
}

results[14] get the same value as results[7] just like if there was no change in which threads were active in the if (threadIdx.x < 16) block.

arsenm · 2023-05-02T00:38:19Z

This is the class of problem convergence tokens are intended to solve https://discourse.llvm.org/t/rfc-introduce-convergence-control-intrinsics/69613/3

Epliz · 2023-05-02T14:28:23Z

OK, glad to see that it is a know issue and that it is being worked on.
Any suggestion how to make it work in the meantime?
Maybe guarding the function entry/exit with some asm volatile statements indicating that the execution masks are being modified or something like that in order to avoid any hoisting or code motion? Or some special function attributes?

arsenm · 2023-05-02T15:30:38Z

Your only options right now are to apply more instances of the "ockl ballot hoisting hack" that you noticed, or add in some wave barrier / subgroup barrier calls depending on the situation. By wrapping the ballot with another function, you're losing the hack in earlier control flow optimizations.

Epliz · 2023-05-02T18:49:41Z

What's a wave barrier in HIP? There doesn't seem to be the __syncwarp from CUDA? Does that correspond to the builtin __builtin_amdgcn_wave_barrier ? If so, doesn't seem to help.
__syncthreads doesn't seem to help either.

How would I apply the "ockl ballot hoisting hack" for code I write? There doesn't seem to be anything in the HIP implementation of __ballot that indicates that this hack should be applied.

arsenm · 2023-05-03T00:07:56Z

By wave barrier, I mean __builtin_amdgcn_wave_barrier which is the analog of __syncwarp. Not sure if HIP currently exposes a proper subgroup API

The hack is just using inline asm with a tied output constraint that copies its input, like:
__asm__ volatile ("" : "=v"(out_val) : "0"(in_val))

This is buried in OCKL, not in the hip headers

Epliz · 2023-05-03T06:12:11Z

Thanks a lot, the hoisting hack works!
Good for me for now.

One last question, is it safe that at https://github.com/ROCm-Developer-Tools/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h the __ballot / __ballot64 methods are not guarded with the ballot hack?

arsenm · 2023-05-03T14:52:53Z

Thanks a lot, the hoisting hack works! Good for me for now.

One last question, is it safe that at https://github.com/ROCm-Developer-Tools/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h the __ballot / __ballot64 methods are not guarded with the ballot hack?

It's not really a principled enough hack to justify spreading it everywhere. As you've discovered, it's at most pushing the symptoms out one layer. We need convergence tokens to solve this in general

Epliz · 2023-08-23T09:00:16Z

Hi @arsenm ,
I see that most of the pull requests related to convergence tokens have been merged in LLVM (except https://reviews.llvm.org/D153744 ).
Does that mean that I can expect to see this bug fixed in the current LLVM or at least the next packaged LLVM from ROCM?
If so I could try and confirm the fix.

arsenm · 2023-08-23T10:49:10Z

Hi @arsenm , I see that most of the pull requests related to convergence tokens have been merged in LLVM (except https://reviews.llvm.org/D153744 ). Does that mean that I can expect to see this bug fixed in the current LLVM or at least the next packaged LLVM from ROCM? If so I could try and confirm the fix.

The base infrastructure has been committed, but clang isn't producing convergence tokens yet cc @sameerds

ssahasra · 2023-08-24T14:52:50Z

Well, the short answer is "don't hold your breathe just yet" for convergence tokens. There is quite some work left in both frontend and backend to get this right.

ssahasra · 2023-08-28T10:31:38Z

I think this bug no longer depends on convergence control tokens, after the following change:

https://reviews.llvm.org/D144756
[SimplifyCFG] Do not hoist/sink convergent function calls

There have been similar recent changes like this in LLVM, that conservatively prevent optimizations around convergent operations. The control tokens will eventually improve these conservative changes to allow some optimizations in the right places. If the bug is still reproducible in the main branch, we should take closer look and just fix it without depending on tokens.

ssahasra · 2023-08-28T16:04:17Z

I just tried with a very recent version of Clang from LLVM main (using HIP_CLANG_PATH in hipcc). The test passed.

Epliz · 2023-09-01T09:10:09Z

@ssahasra , happy to read that :-) .
Makes sense that just avoiding hoisting was sufficient.

I will take your word for it (I won't try on LLVM master myself), and close the issue.
Thanks a lot for your work on this!

github-actions bot added the new issue label May 1, 2023

EugeneZelenko added backend:AMDGPU and removed new issue labels May 1, 2023

Epliz changed the title ~~AMDGPU: weird miscompilation when calling function calling __ballot HIP function~~ AMDGPU: weird miscompilation when using __ballot HIP function May 1, 2023

Epliz changed the title ~~AMDGPU: weird miscompilation when using __ballot HIP function~~ AMDGPU: weird miscompilations when using __ballot HIP function May 1, 2023

Epliz closed this as completed Sep 1, 2023

Epliz mentioned this issue Sep 1, 2023

Missing warp match functions in HIP ROCm/clr#9

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

AMDGPU: weird miscompilations when using __ballot HIP function #62477

AMDGPU: weird miscompilations when using __ballot HIP function #62477

Epliz commented May 1, 2023 •

edited

Loading

llvmbot commented May 1, 2023

Epliz commented May 1, 2023

arsenm commented May 2, 2023

Epliz commented May 2, 2023

arsenm commented May 2, 2023

Epliz commented May 2, 2023

arsenm commented May 3, 2023

Epliz commented May 3, 2023

arsenm commented May 3, 2023

Epliz commented Aug 23, 2023

arsenm commented Aug 23, 2023

ssahasra commented Aug 24, 2023

ssahasra commented Aug 28, 2023

ssahasra commented Aug 28, 2023

Epliz commented Sep 1, 2023

AMDGPU: weird miscompilations when using __ballot HIP function #62477

AMDGPU: weird miscompilations when using __ballot HIP function #62477

Comments

Epliz commented May 1, 2023 • edited Loading

llvmbot commented May 1, 2023

Epliz commented May 1, 2023

arsenm commented May 2, 2023

Epliz commented May 2, 2023

arsenm commented May 2, 2023

Epliz commented May 2, 2023

arsenm commented May 3, 2023

Epliz commented May 3, 2023

arsenm commented May 3, 2023

Epliz commented Aug 23, 2023

arsenm commented Aug 23, 2023

ssahasra commented Aug 24, 2023

ssahasra commented Aug 28, 2023

ssahasra commented Aug 28, 2023

Epliz commented Sep 1, 2023

Epliz commented May 1, 2023 •

edited

Loading