Skip to content

Commit

Permalink
Remove cuda::ptx::mapa (NVIDIA#1442)
Browse files Browse the repository at this point in the history
  • Loading branch information
ahendriksen authored and miscco committed Feb 29, 2024
1 parent 793f509 commit aac4d6e
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 91 deletions.
35 changes: 26 additions & 9 deletions libcudacxx/docs/ptx.md
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
| [`cvta`] | No |
| [`cvt`] | No |
| [`cvt.pack`] | No |
| [`mapa`] | CTK-FUTURE, CCCL v2.4.0 |
| [`mapa`] | No |
| [`getctarank`] | CTK-FUTURE, CCCL v2.4.0 |

[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2
Expand Down Expand Up @@ -421,15 +421,32 @@ int main() {

- PTX ISA: [`mapa`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa)

**mapa**:
This instruction can [currently not be
implemented](https://github.com/NVIDIA/cccl/issues/1414) by libcu++. The
instruction can be accessed through the cooperative groups
[cluster_group](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cluster-group)
API:

Usage:
```cuda
// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename Tp>
__device__ static inline Tp* mapa(
cuda::ptx::space_cluster_t,
const Tp* addr,
uint32_t target_cta);
#include <cooperative_groups.h>
__cluster_dims__(2)
__global__ void kernel() {
__shared__ int x;
x = 1;
namespace cg = cooperative_groups;
cg::cluster_group cluster = cg::this_cluster();
cluster.sync();
// Get address of remote shared memory value:
unsigned int other_block_rank = cluster.block_rank() ^ 1;
int * remote_x = cluster.map_shared_rank(&bar, other_block_rank);
// Write to remote value:
*remote_x = 2;
}
```

#### `getctarank`
Expand Down
36 changes: 0 additions & 36 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,42 +542,6 @@ _CCCL_DEVICE static inline void st_async(
// 9.7.8.22. Data Movement and Conversion Instructions: mapa
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa

/*
// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename Tp>
__device__ static inline Tp* mapa(
cuda::ptx::space_cluster_t,
const Tp* addr,
uint32_t target_cta);
*/
#if __cccl_ptx_isa >= 780
extern "C" _CCCL_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__();
template <typename _Tp>
_CCCL_DEVICE static inline _Tp* mapa(
space_cluster_t,
const _Tp* __addr,
_CUDA_VSTD::uint32_t __target_cta)
{
// __space == space_cluster (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
_CUDA_VSTD::uint32_t __dest;
asm (
"mapa.shared::cluster.u32 %0, %1, %2;"
: "=r"(__dest)
: "r"(__as_ptr_smem(__addr)),
"r"(__target_cta)
:
);
return __from_ptr_dsmem<_Tp>(__dest);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_mapa_is_not_supported_before_SM_90__();
return __from_ptr_dsmem<_Tp>(0);
));
}
#endif // __cccl_ptx_isa >= 780

// 9.7.8.23. Data Movement and Conversion Instructions: getctarank
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
/*
Expand Down
46 changes: 0 additions & 46 deletions libcudacxx/test/libcudacxx/cuda/ptx/ptx.mapa.compile.pass.cpp

This file was deleted.

0 comments on commit aac4d6e

Please sign in to comment.