Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef<mlir::Value>);
void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
template <const char *fctName, int extent>
fir::ExtendedValue genLDXXFunc(mlir::Type,
Expand Down
37 changes: 37 additions & 0 deletions flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ static constexpr IntrinsicHandler cudaHandlers[]{
&CI::genNVVMTime<mlir::NVVM::Clock64Op>),
{},
/*isElemental=*/false},
{"cluster_dim_blocks",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genClusterDimBlocks),
{},
/*isElemental=*/false},
{"fence_proxy_async",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genFenceProxyAsync),
Expand Down Expand Up @@ -985,6 +990,38 @@ CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
.getResult(0);
}

// CLUSTER_DIM_BLOCKS
mlir::Value
CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto recTy = mlir::cast<fir::RecordType>(resultType);
assert(recTy && "RecordType expepected");
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);

auto insertDim = [&](mlir::Value dim, unsigned fieldPos) {
auto fieldName = recTy.getTypeList()[fieldPos].first;
mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second;
mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
mlir::Value fieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, fieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value coord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(fieldTy), res, fieldIndex);
fir::StoreOp::create(builder, loc, dim, coord);
};

mlir::Type i32Ty = builder.getI32Type();
mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty);
insertDim(x, 0);
mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty);
insertDim(y, 1);
mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty);
insertDim(z, 2);

return res;
}

// FENCE_PROXY_ASYNC
void CUDAIntrinsicLibrary::genFenceProxyAsync(
llvm::ArrayRef<fir::ExtendedValue> args) {
Expand Down
7 changes: 7 additions & 0 deletions flang/module/cooperative_groups.f90
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ module cooperative_groups
integer(4) :: rank
end type thread_group

interface
attributes(device) function cluster_dim_blocks()
import
type(dim3) :: cluster_dim_blocks
end function
end interface

interface
attributes(device) function this_cluster()
import
Expand Down
34 changes: 34 additions & 0 deletions flang/test/Lower/CUDA/cuda-cluster.cuf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s

attributes(global) subroutine test_this_cluster()
use cooperative_groups
type(cluster_group) :: cluster

cluster = this_cluster()
end subroutine

! CHECK-LABEL: func.func @_QPtest_this_cluster() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
! CHECK: %{{.*}} = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group
! CHECK: %[[RES:.*]] = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>
! CHECK: %[[RANK:.*]] = nvvm.read.ptx.sreg.cluster.ctarank : i32
! CHECK: %[[RANK_1:.*]] = arith.addi %[[RANK]], %c1{{.*}} : i32
! CHECK: %[[RANK_COORD:.*]] = fir.coordinate_of %[[RES]], rank : (!fir.ref<!fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>>) -> !fir.ref<i32>
! CHECK: fir.store %[[RANK_1]] to %[[RANK_COORD]] : !fir.ref<i32>

attributes(global) subroutine test_cluster_dim_blocks()
use cooperative_groups
type(dim3) :: clusterDim

clusterDim = cluster_dim_blocks()
end subroutine

! CHECK-LABEL: func.func @_QPtest_cluster_dim_blocks() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
! CHECK: %[[X:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.x : i32
! CHECK: %[[COORD_X:.*]] = fir.coordinate_of %{{.*}}, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
! CHECK: fir.store %[[X]] to %[[COORD_X]] : !fir.ref<i32>
! CHECK: %[[Y:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.y : i32
! CHECK: %[[COORD_Y:.*]] = fir.coordinate_of %{{.*}}, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
! CHECK: fir.store %[[Y]] to %[[COORD_Y]] : !fir.ref<i32>
! CHECK: %[[Z:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.z : i32
! CHECK: %[[COORD_Z:.*]] = fir.coordinate_of %{{.*}}, z : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
! CHECK: fir.store %[[Z]] to %[[COORD_Z]] : !fir.ref<i32>