diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h index ae7d566920656..027bd3b79a1df 100644 --- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -60,6 +60,7 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary { mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef); mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef); void genSyncWarp(llvm::ArrayRef); + mlir::Value genThisCluster(mlir::Type, llvm::ArrayRef); mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef); mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef); mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index f67129dfa6730..c560c53033780 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -457,6 +457,10 @@ static constexpr IntrinsicHandler cudaHandlers[]{ static_cast(&CI::genSyncWarp), {}, /*isElemental=*/false}, + {"this_cluster", + static_cast(&CI::genThisCluster), + {}, + /*isElemental=*/false}, {"this_grid", static_cast(&CI::genThisGrid), {}, @@ -1122,6 +1126,44 @@ void CUDAIntrinsicLibrary::genSyncWarp( mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0])); } +// THIS_CLUSTER +mlir::Value +CUDAIntrinsicLibrary::genThisCluster(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0); + auto recTy = mlir::cast(resultType); + assert(recTy && "RecordType expepected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // SIZE + mlir::Value size = mlir::NVVM::ClusterDim::create(builder, loc, i32Ty); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // RANK + mlir::Value rank = mlir::NVVM::ClusterId::create(builder, loc, i32Ty); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + + return res; +} + // THIS_GRID mlir::Value CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, diff --git a/flang/module/cooperative_groups.f90 b/flang/module/cooperative_groups.f90 index b8875f72f8079..1c89866f9c84a 100644 --- a/flang/module/cooperative_groups.f90 +++ b/flang/module/cooperative_groups.f90 @@ -14,6 +14,12 @@ module cooperative_groups implicit none +type :: cluster_group + type(c_devptr), private :: handle + integer(4) :: size + integer(4) :: rank +end type cluster_group + type :: grid_group type(c_devptr), private :: handle integer(4) :: size @@ -32,6 +38,13 @@ module cooperative_groups integer(4) :: rank end type thread_group +interface + attributes(device) function this_cluster() + import + type(cluster_group) :: this_cluster + end function +end interface + interface attributes(device) function this_grid() import