diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h index 027bd3b79a1df..cedc7a9437eb5 100644 --- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -47,6 +47,7 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary { void genBarrierInit(llvm::ArrayRef); mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef); mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef); + mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef); void genFenceProxyAsync(llvm::ArrayRef); template fir::ExtendedValue genLDXXFunc(mlir::Type, diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index c560c53033780..a770e2d9cdeff 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -368,6 +368,11 @@ static constexpr IntrinsicHandler cudaHandlers[]{ &CI::genNVVMTime), {}, /*isElemental=*/false}, + {"cluster_dim_blocks", + static_cast( + &CI::genClusterDimBlocks), + {}, + /*isElemental=*/false}, {"fence_proxy_async", static_cast( &CI::genFenceProxyAsync), @@ -985,6 +990,38 @@ CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, .getResult(0); } +// CLUSTER_DIM_BLOCKS +mlir::Value +CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0); + auto recTy = mlir::cast(resultType); + assert(recTy && "RecordType expepected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + + auto insertDim = [&](mlir::Value dim, unsigned fieldPos) { + auto fieldName = recTy.getTypeList()[fieldPos].first; + mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value fieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, fieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value coord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(fieldTy), res, fieldIndex); + fir::StoreOp::create(builder, loc, dim, coord); + }; + + mlir::Type i32Ty = builder.getI32Type(); + mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty); + insertDim(x, 0); + mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty); + insertDim(y, 1); + mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty); + insertDim(z, 2); + + return res; +} + // FENCE_PROXY_ASYNC void CUDAIntrinsicLibrary::genFenceProxyAsync( llvm::ArrayRef args) { diff --git a/flang/module/cooperative_groups.f90 b/flang/module/cooperative_groups.f90 index 1c89866f9c84a..2631975837a5b 100644 --- a/flang/module/cooperative_groups.f90 +++ b/flang/module/cooperative_groups.f90 @@ -38,6 +38,13 @@ module cooperative_groups integer(4) :: rank end type thread_group +interface + attributes(device) function cluster_dim_blocks() + import + type(dim3) :: cluster_dim_blocks + end function +end interface + interface attributes(device) function this_cluster() import diff --git a/flang/test/Lower/CUDA/cuda-cluster.cuf b/flang/test/Lower/CUDA/cuda-cluster.cuf new file mode 100644 index 0000000000000..51cc4208a35de --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-cluster.cuf @@ -0,0 +1,34 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +attributes(global) subroutine test_this_cluster() + use cooperative_groups + type(cluster_group) :: cluster + + cluster = this_cluster() +end subroutine + +! CHECK-LABEL: func.func @_QPtest_this_cluster() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: %{{.*}} = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group +! CHECK: %[[RES:.*]] = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}> +! CHECK: %[[RANK:.*]] = nvvm.read.ptx.sreg.cluster.ctarank : i32 +! CHECK: %[[RANK_1:.*]] = arith.addi %[[RANK]], %c1{{.*}} : i32 +! CHECK: %[[RANK_COORD:.*]] = fir.coordinate_of %[[RES]], rank : (!fir.ref}>,size:i32,rank:i32}>>) -> !fir.ref +! CHECK: fir.store %[[RANK_1]] to %[[RANK_COORD]] : !fir.ref + +attributes(global) subroutine test_cluster_dim_blocks() + use cooperative_groups + type(dim3) :: clusterDim + + clusterDim = cluster_dim_blocks() +end subroutine + +! CHECK-LABEL: func.func @_QPtest_cluster_dim_blocks() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: %[[X:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.x : i32 +! CHECK: %[[COORD_X:.*]] = fir.coordinate_of %{{.*}}, x : (!fir.ref>) -> !fir.ref +! CHECK: fir.store %[[X]] to %[[COORD_X]] : !fir.ref +! CHECK: %[[Y:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.y : i32 +! CHECK: %[[COORD_Y:.*]] = fir.coordinate_of %{{.*}}, y : (!fir.ref>) -> !fir.ref +! CHECK: fir.store %[[Y]] to %[[COORD_Y]] : !fir.ref +! CHECK: %[[Z:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.z : i32 +! CHECK: %[[COORD_Z:.*]] = fir.coordinate_of %{{.*}}, z : (!fir.ref>) -> !fir.ref +! CHECK: fir.store %[[Z]] to %[[COORD_Z]] : !fir.ref