2 changes: 1 addition & 1 deletion flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -14,7 +14,7 @@
#include "mlir/IR/BuiltinOps.h"

static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";
static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem__";

namespace fir {
class FirOpBuilder;
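(Note: the trailing underscores make the suffix double as a separator: the dynamic global keeps the bare name, e.g. _QPdynshared__shared_mem__, while the per-variable static globals introduced by this patch append the variable's bindc name after it, e.g. _QPshared_static__shared_mem__a. Both forms appear in the updated tests below.)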
3 changes: 2 additions & 1 deletion flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -351,7 +351,8 @@ def cuf_SharedMemoryOp
      OptionalAttr<StrAttr>:$bindc_name, Variadic<AnyIntegerType>:$typeparams,
      Variadic<AnyIntegerType>:$shape,
      // offset in bytes from the shared memory base address.
      Optional<AnyIntegerType>:$offset, OptionalAttr<I64Attr>:$alignment);
      Optional<AnyIntegerType>:$offset, OptionalAttr<I64Attr>:$alignment,
      UnitAttr:$isStatic);

  let results = (outs fir_ReferenceType:$ptr);

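For illustration, this is the printed form of the op once the pass below sets the new attribute, taken from the updated cuda-shared-offset.mlir checks (static variables now carry a zero offset, an explicit alignment, and the isStatic marker; the %0 result name here is illustrative, the tests match it with a wildcard):

%0 = cuf.shared_memory[%c0 : i32] i32 align 4 {bindc_name = "a", isStatic, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>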
3 changes: 2 additions & 1 deletion flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -333,7 +333,8 @@ void cuf::SharedMemoryOp::build(
      bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
  build(builder, result, wrapAllocaResultType(inType),
        mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
        /*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{});
        /*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{},
        /*isStatic=*/nullptr);
  result.addAttributes(attributes);
}

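The builder leaves isStatic unset by default (the /*isStatic=*/nullptr above); within this patch it is only flipped on by the CUFComputeSharedMemoryOffsetsAndSize pass below, via sharedOp.setIsStatic(true), once a variable has been classified as static shared memory.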
flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -46,6 +46,43 @@ static bool isAssumedSize(mlir::ValueRange shape) {
  return false;
}

static void createSharedMemoryGlobal(fir::FirOpBuilder &builder,
                                     mlir::Location loc, llvm::StringRef prefix,
                                     llvm::StringRef suffix,
                                     mlir::gpu::GPUModuleOp gpuMod,
                                     mlir::Type sharedMemType, unsigned size,
                                     unsigned align, bool isDynamic) {
  std::string sharedMemGlobalName =
      isDynamic ? (prefix + llvm::Twine(cudaSharedMemSuffix)).str()
                : (prefix + llvm::Twine(cudaSharedMemSuffix) + suffix).str();

  mlir::OpBuilder::InsertionGuard guard(builder);
  builder.setInsertionPointToEnd(gpuMod.getBody());

  mlir::StringAttr linkage = isDynamic ? builder.createExternalLinkage()
                                       : builder.createInternalLinkage();
  llvm::SmallVector<mlir::NamedAttribute> attrs;
  auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
                                          gpuMod.getContext());
  attrs.push_back(mlir::NamedAttribute(
      fir::GlobalOp::getDataAttrAttrName(globalOpName),
      cuf::DataAttributeAttr::get(gpuMod.getContext(),
                                  cuf::DataAttribute::Shared)));

  mlir::DenseElementsAttr init = {};
  mlir::Type i8Ty = builder.getI8Type();
  if (size > 0) {
    auto vecTy = mlir::VectorType::get(
        static_cast<fir::SequenceType::Extent>(size), i8Ty);
    mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
    init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
  }
  auto sharedMem =
      fir::GlobalOp::create(builder, loc, sharedMemGlobalName, false, false,
                            sharedMemType, init, linkage, attrs);
  sharedMem.setAlignment(align);
}
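The helper thus implements both naming schemes in one place: with isDynamic the global is named prefix + "__shared_mem__" (the suffix argument is ignored), otherwise prefix + "__shared_mem__" + suffix, and callers pass the variable's bindc name as the suffix. For a kernel _QPshared_static and a static i64 variable "e", for example, this yields the global checked in the tests below:

fir.global internal @_QPshared_static__shared_mem__e(dense<0> : vector<8xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<8xi8>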

struct CUFComputeSharedMemoryOffsetsAndSize
    : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
          CUFComputeSharedMemoryOffsetsAndSize> {
@@ -108,18 +145,23 @@ struct CUFComputeSharedMemoryOffsetsAndSize
                                                   crtDynOffset, dynSize);
          else
            crtDynOffset = dynSize;

          continue;
        } else {
          // Static shared memory.
          auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
              loc, sharedOp.getInType(), *dl, kindMap);
          createSharedMemoryGlobal(
              builder, sharedOp.getLoc(), funcOp.getName(),
              *sharedOp.getBindcName(), gpuMod,
              fir::SequenceType::get(size, i8Ty), size,
              sharedOp.getAlignment() ? *sharedOp.getAlignment() : align,
              /*isDynamic=*/false);
          mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
          sharedOp.getOffsetMutable().assign(zero);
          if (!sharedOp.getAlignment())
            sharedOp.setAlignment(align);
          sharedOp.setIsStatic(true);
          ++nbStaticSharedVariables;
        }
        auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
            sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
        ++nbStaticSharedVariables;
        mlir::Value offset = builder.createIntegerConstant(
            loc, i32Ty, llvm::alignTo(sharedMemSize, align));
        sharedOp.getOffsetMutable().assign(offset);
        sharedMemSize =
            llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
        alignment = std::max(alignment, align);
      }

      if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
@@ -130,35 +172,13 @@ struct CUFComputeSharedMemoryOffsetsAndSize
            funcOp.getLoc(),
            "static and dynamic shared variables in a single kernel");

      mlir::DenseElementsAttr init = {};
      if (sharedMemSize > 0) {
        auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
        mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
        init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
      }
      if (nbStaticSharedVariables > 0)
        continue;

      // Create the shared memory global where each shared variable will point
      // to.
      auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
      std::string sharedMemGlobalName =
          (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
      // Dynamic shared memory needs an external linkage while static shared
      // memory needs an internal linkage.
      mlir::StringAttr linkage = nbDynamicSharedVariables > 0
                                     ? builder.createExternalLinkage()
                                     : builder.createInternalLinkage();
      builder.setInsertionPointToEnd(gpuMod.getBody());
      llvm::SmallVector<mlir::NamedAttribute> attrs;
      auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
                                              gpuMod.getContext());
      attrs.push_back(mlir::NamedAttribute(
          fir::GlobalOp::getDataAttrAttrName(globalOpName),
          cuf::DataAttributeAttr::get(gpuMod.getContext(),
                                      cuf::DataAttribute::Shared)));
      auto sharedMem = fir::GlobalOp::create(
          builder, funcOp.getLoc(), sharedMemGlobalName, false, false,
          sharedMemType, init, linkage, attrs);
      sharedMem.setAlignment(alignment);
      createSharedMemoryGlobal(builder, funcOp.getLoc(), funcOp.getName(), "",
                               gpuMod, sharedMemType, sharedMemSize, alignment,
                               /*isDynamic=*/true);
    }
  }
};
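Net effect of the pass changes: each static shared variable now gets its own internal, type-sized global and a constant offset of 0 (the new else branch above), while dynamic shared memory keeps the single external, zero-sized global per function; the old aggregated static block is gone, and the function-level global is skipped when static variables are present (the if (nbStaticSharedVariables > 0) continue; early exit). In the checks below, variable "b" accordingly moves from cuf.shared_memory[%c4 : i32] to cuf.shared_memory[%c0 : i32] ... align 4 {bindc_name = "b", isStatic, ...}, backed by @_QPshared_static__shared_mem__b.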
7 changes: 6 additions & 1 deletion flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -249,8 +249,13 @@ struct CUFSharedMemoryOpConversion
"cuf.shared_memory must have an offset for code gen");

auto gpuMod = op->getParentOfType<gpu::GPUModuleOp>();

std::string sharedGlobalName =
(getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
op.getIsStatic()
? (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix) +
*op.getBindcName())
.str()
: (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
mlir::Value sharedGlobalAddr =
createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName);

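As a sketch (not an in-tree check): for a static op with bindc_name "a", the conversion would now emit an address-of of the per-variable global, indexed by the op's constant offset, which is 0 for static variables:

%addr = llvm.mlir.addressof @_QPshared_static__shared_mem__a : !llvm.ptr<3>
%cast = llvm.addrspacecast %addr : !llvm.ptr<3> to !llvm.ptr
%a = llvm.getelementptr %cast[%c0] : (!llvm.ptr, i32) -> !llvm.ptr, i8

The cuda-shared-to-llvm.mlir test below still exercises ops without isStatic, which fall back to the renamed function-level global.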
4 changes: 2 additions & 2 deletions flang/test/Fir/CUDA/cuda-code-gen.mlir
@@ -201,9 +201,9 @@ func.func @_QMm1Psub1(%arg0: !fir.box<!fir.array<?xi32>> {cuf.data_attr = #cuf.c

// -----

fir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
fir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>

// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>

// -----

23 changes: 14 additions & 9 deletions flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -17,7 +17,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
// CHECK: fir.global external @_QPdynshared__shared_mem__ {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>

// -----

@@ -43,15 +43,20 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPshared_static()
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c4{{.*}} : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c8{{.*}} : i32] i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c12{{.*}} : i32] i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c16{{.*}} : i32] i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
// CHECK: cuf.shared_memory[%c24{{.*}} : i32] f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "a", isStatic, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "b", isStatic, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "c", isStatic, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "d", isStatic, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i64 align 8 {bindc_name = "e", isStatic, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] f32 align 4 {bindc_name = "r", isStatic, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
// CHECK: fir.global internal @_QPshared_static__shared_mem__a(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
// CHECK: fir.global internal @_QPshared_static__shared_mem__b(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
// CHECK: fir.global internal @_QPshared_static__shared_mem__c(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
// CHECK: fir.global internal @_QPshared_static__shared_mem__d(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
// CHECK: fir.global internal @_QPshared_static__shared_mem__e(dense<0> : vector<8xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<8xi8>
// CHECK: fir.global internal @_QPshared_static__shared_mem__r(dense<0> : vector<4xi8>) {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
// CHECK: }
// CHECK: }

@@ -159,4 +164,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>

// CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
// CHECK: fir.global external @_QMmtestsPtestany__shared_mem__ {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
6 changes: 3 additions & 3 deletions flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir
@@ -9,14 +9,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
%1 = cuf.shared_memory [%c4 : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
llvm.return
}
llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
llvm.mlir.global common @_QPshared_static__shared_mem__(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
}
}

// CHECK-LABEL: llvm.func @_QPshared_static()
// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3>
// CHECK: %[[ADDRCAST0:.*]] = llvm.addrspacecast %[[ADDR0]] : !llvm.ptr<3> to !llvm.ptr
// CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][%c0{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8
// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem__ : !llvm.ptr<3>
// CHECK: %[[ADDRCAST1:.*]] = llvm.addrspacecast %[[ADDR1]] : !llvm.ptr<3> to !llvm.ptr
// CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][%c4{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8