From cf95e175ccc96fd7868e1300a0536824ad40a174 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Tue, 2 Dec 2025 14:21:26 -0800 Subject: [PATCH] [flang][cuda] Change how to handle static shared memory variables --- .../flang/Optimizer/Builder/CUFCommon.h | 2 +- .../flang/Optimizer/Dialect/CUF/CUFOps.td | 3 +- flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp | 3 +- .../CUFComputeSharedMemoryOffsetsAndSize.cpp | 96 +++++++++++-------- .../Transforms/CUFGPUToLLVMConversion.cpp | 7 +- flang/test/Fir/CUDA/cuda-code-gen.mlir | 4 +- flang/test/Fir/CUDA/cuda-shared-offset.mlir | 23 +++-- flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir | 6 +- 8 files changed, 88 insertions(+), 56 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index 98d01958846f7..736f90123969c 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -14,7 +14,7 @@ #include "mlir/IR/BuiltinOps.h" static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod"; -static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem"; +static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem__"; namespace fir { class FirOpBuilder; diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index 3fda523acb382..920bef99dc996 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -351,7 +351,8 @@ def cuf_SharedMemoryOp OptionalAttr:$bindc_name, Variadic:$typeparams, Variadic:$shape, // offset in bytes from the shared memory base address. 
- Optional:$offset, OptionalAttr:$alignment); + Optional:$offset, OptionalAttr:$alignment, + UnitAttr:$isStatic); let results = (outs fir_ReferenceType:$ptr); diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index 671e5f9455c22..97f7f76a8fbe7 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -333,7 +333,8 @@ void cuf::SharedMemoryOp::build( bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName); build(builder, result, wrapAllocaResultType(inType), mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape, - /*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{}); + /*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{}, + /*isStatic=*/nullptr); result.addAttributes(attributes); } diff --git a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp index a64494510d847..7bae0602fe5ca 100644 --- a/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp +++ b/flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp @@ -46,6 +46,43 @@ static bool isAssumedSize(mlir::ValueRange shape) { return false; } +static void createSharedMemoryGlobal(fir::FirOpBuilder &builder, + mlir::Location loc, llvm::StringRef prefix, + llvm::StringRef suffix, + mlir::gpu::GPUModuleOp gpuMod, + mlir::Type sharedMemType, unsigned size, + unsigned align, bool isDynamic) { + std::string sharedMemGlobalName = + isDynamic ? (prefix + llvm::Twine(cudaSharedMemSuffix)).str() + : (prefix + llvm::Twine(cudaSharedMemSuffix) + suffix).str(); + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToEnd(gpuMod.getBody()); + + mlir::StringAttr linkage = isDynamic ? 
builder.createExternalLinkage() + : builder.createInternalLinkage(); + llvm::SmallVector attrs; + auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), + gpuMod.getContext()); + attrs.push_back(mlir::NamedAttribute( + fir::GlobalOp::getDataAttrAttrName(globalOpName), + cuf::DataAttributeAttr::get(gpuMod.getContext(), + cuf::DataAttribute::Shared))); + + mlir::DenseElementsAttr init = {}; + mlir::Type i8Ty = builder.getI8Type(); + if (size > 0) { + auto vecTy = mlir::VectorType::get( + static_cast(size), i8Ty); + mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0); + init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); + } + auto sharedMem = + fir::GlobalOp::create(builder, loc, sharedMemGlobalName, false, false, + sharedMemType, init, linkage, attrs); + sharedMem.setAlignment(align); +} + struct CUFComputeSharedMemoryOffsetsAndSize : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase< CUFComputeSharedMemoryOffsetsAndSize> { @@ -108,18 +145,23 @@ struct CUFComputeSharedMemoryOffsetsAndSize crtDynOffset, dynSize); else crtDynOffset = dynSize; - - continue; + } else { + // Static shared memory. + auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash( + loc, sharedOp.getInType(), *dl, kindMap); + createSharedMemoryGlobal( + builder, sharedOp.getLoc(), funcOp.getName(), + *sharedOp.getBindcName(), gpuMod, + fir::SequenceType::get(size, i8Ty), size, + sharedOp.getAlignment() ? 
*sharedOp.getAlignment() : align, + /*isDynamic=*/false); + mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0); + sharedOp.getOffsetMutable().assign(zero); + if (!sharedOp.getAlignment()) + sharedOp.setAlignment(align); + sharedOp.setIsStatic(true); + ++nbStaticSharedVariables; } - auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash( - sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap); - ++nbStaticSharedVariables; - mlir::Value offset = builder.createIntegerConstant( - loc, i32Ty, llvm::alignTo(sharedMemSize, align)); - sharedOp.getOffsetMutable().assign(offset); - sharedMemSize = - llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align); - alignment = std::max(alignment, align); } if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0) @@ -130,35 +172,13 @@ struct CUFComputeSharedMemoryOffsetsAndSize funcOp.getLoc(), "static and dynamic shared variables in a single kernel"); - mlir::DenseElementsAttr init = {}; - if (sharedMemSize > 0) { - auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty); - mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0); - init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); - } + if (nbStaticSharedVariables > 0) + continue; - // Create the shared memory global where each shared variable will point - // to. auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty); - std::string sharedMemGlobalName = - (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str(); - // Dynamic shared memory needs an external linkage while static shared - // memory needs an internal linkage. - mlir::StringAttr linkage = nbDynamicSharedVariables > 0 - ? 
builder.createExternalLinkage() - : builder.createInternalLinkage(); - builder.setInsertionPointToEnd(gpuMod.getBody()); - llvm::SmallVector attrs; - auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), - gpuMod.getContext()); - attrs.push_back(mlir::NamedAttribute( - fir::GlobalOp::getDataAttrAttrName(globalOpName), - cuf::DataAttributeAttr::get(gpuMod.getContext(), - cuf::DataAttribute::Shared))); - auto sharedMem = fir::GlobalOp::create( - builder, funcOp.getLoc(), sharedMemGlobalName, false, false, - sharedMemType, init, linkage, attrs); - sharedMem.setAlignment(alignment); + createSharedMemoryGlobal(builder, funcOp.getLoc(), funcOp.getName(), "", + gpuMod, sharedMemType, sharedMemSize, alignment, + /*isDynamic=*/true); } } }; diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index 40f180a8c1657..d5a8212eb5472 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -249,8 +249,13 @@ struct CUFSharedMemoryOpConversion "cuf.shared_memory must have an offset for code gen"); auto gpuMod = op->getParentOfType(); + std::string sharedGlobalName = - (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str(); + op.getIsStatic() + ? 
(getFuncName(op) + llvm::Twine(cudaSharedMemSuffix) + + *op.getBindcName()) + .str() + : (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str(); mlir::Value sharedGlobalAddr = createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName); diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir index 60cda9e98c7d8..e83648f21bdf1 100644 --- a/flang/test/Fir/CUDA/cuda-code-gen.mlir +++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir @@ -201,9 +201,9 @@ func.func @_QMm1Psub1(%arg0: !fir.box> {cuf.data_attr = #cuf.c // ----- -fir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda} : !fir.array<28xi8> +fir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda} : !fir.array<28xi8> -// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8> +// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8> // ----- diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir index 37b36b2bd050e..1a39fefe85cda 100644 --- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir +++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir @@ -17,7 +17,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, %{{.*}} : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref> // CHECK: gpu.return // CHECK: } -// CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda} : !fir.array<0xi8> +// CHECK: fir.global external @_QPdynshared__shared_mem__ {alignment = 4 : i64, data_attr = #cuf.cuda} : !fir.array<0xi8> // ----- @@ -43,15 +43,20 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry !fir.ref -// CHECK: cuf.shared_memory[%c4{{.*}} : i32] i32 
{bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref -// CHECK: cuf.shared_memory[%c8{{.*}} : i32] i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref -// CHECK: cuf.shared_memory[%c12{{.*}} : i32] i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref -// CHECK: cuf.shared_memory[%c16{{.*}} : i32] i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref -// CHECK: cuf.shared_memory[%c24{{.*}} : i32] f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref +// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "a", isStatic, uniq_name = "_QFshared_staticEa"} -> !fir.ref +// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "b", isStatic, uniq_name = "_QFshared_staticEb"} -> !fir.ref +// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "c", isStatic, uniq_name = "_QFshared_staticEc"} -> !fir.ref +// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "d", isStatic, uniq_name = "_QFshared_staticEd"} -> !fir.ref +// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i64 align 8 {bindc_name = "e", isStatic, uniq_name = "_QFshared_staticEe"} -> !fir.ref +// CHECK: cuf.shared_memory[%c0{{.*}} : i32] f32 align 4 {bindc_name = "r", isStatic, uniq_name = "_QFshared_staticEr"} -> !fir.ref // CHECK: gpu.return // CHECK: } -// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda} : !fir.array<28xi8> +// CHECK: fir.global internal @_QPshared_static__shared_mem__a(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda} : !fir.array<4xi8> +// CHECK: fir.global internal @_QPshared_static__shared_mem__b(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda} : !fir.array<4xi8> +// CHECK: fir.global internal @_QPshared_static__shared_mem__c(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda} : !fir.array<4xi8> +// CHECK: fir.global 
internal @_QPshared_static__shared_mem__d(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda} : !fir.array<4xi8> +// CHECK: fir.global internal @_QPshared_static__shared_mem__e(dense<0> : vector<8xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda} : !fir.array<8xi8> +// CHECK: fir.global internal @_QPshared_static__shared_mem__r(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda} : !fir.array<4xi8> // CHECK: } // CHECK: } @@ -159,4 +164,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, %{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref> // CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref> -// CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda} : !fir.array<0xi8> +// CHECK: fir.global external @_QMmtestsPtestany__shared_mem__ {alignment = 8 : i64, data_attr = #cuf.cuda} : !fir.array<0xi8> diff --git a/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir b/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir index 26479d1cdd94f..69370613cd348 100644 --- a/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir +++ b/flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir @@ -9,14 +9,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry !fir.ref llvm.return } - llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8> + llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8> } } // CHECK-LABEL: llvm.func @_QPshared_static() -// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3> +// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3> // CHECK: %[[ADDRCAST0:.*]] = llvm.addrspacecast 
%[[ADDR0]] : !llvm.ptr<3> to !llvm.ptr // CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][%c0{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8 -// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3> +// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3> // CHECK: %[[ADDRCAST1:.*]] = llvm.addrspacecast %[[ADDR1]] : !llvm.ptr<3> to !llvm.ptr // CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][%c4{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8