-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[flang][cuda] Add double descriptor information in allocate/deallocate operations #170901
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: After #169740, the cuf allocate and deallocate operations can be converted later. Update the way the double descriptor case is recognized by adding this information directly on the operation itself. Full diff: https://github.com/llvm/llvm-project/pull/170901.diff 7 Files Affected:
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index ef7cdc42d72f2..704b0356c19ed 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -66,6 +66,9 @@ translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
/// there is a conversion. Return null otherwise.
hlfir::ElementalOp isTransferWithConversion(mlir::Value rhs);
+/// Check if the value is an allocatable with double descriptor.
+bool hasDoubleDescriptor(mlir::Value);
+
} // end namespace Fortran::lower
#endif // FORTRAN_LOWER_CUDA_H
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 920bef99dc996..766a0d6bb8ee0 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -100,7 +100,8 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments,
Optional<fir_ReferenceType>:$stream,
Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
- cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat);
+ cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat,
+ UnitAttr:$hasDoubleDescriptor);
let results = (outs AnyIntegerType:$stat);
@@ -126,9 +127,9 @@ def cuf_DeallocateOp : cuf_Op<"deallocate",
}];
let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
- Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
- cuf_DataAttributeAttr:$data_attr,
- UnitAttr:$hasStat);
+ Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
+ cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat,
+ UnitAttr:$hasDoubleDescriptor);
let results = (outs AnyIntegerType:$stat);
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index e7a6c4df40045..2ae13e2bd73fb 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -798,10 +798,12 @@ class AllocateStmtHelper {
// Keep return type the same as a standard AllocatableAllocate call.
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+ bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
return cuf::AllocateOp::create(
builder, loc, retTy, box.getAddr(), errmsg, stream, pinned,
source, cudaAttr,
- errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
+ errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
+ doubleDescriptors ? builder.getUnitAttr() : nullptr)
.getResult();
}
@@ -865,11 +867,13 @@ static mlir::Value genCudaDeallocate(fir::FirOpBuilder &builder,
? nullptr
: errorManager.errMsgAddr;
- // Keep return type the same as a standard AllocatableAllocate call.
+ // Keep return type the same as a standard AllocatableDeallocate call.
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+ bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
return cuf::DeallocateOp::create(
builder, loc, retTy, box.getAddr(), errmsg, cudaAttr,
- errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
+ errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
+ doubleDescriptors ? builder.getUnitAttr() : nullptr)
.getResult();
}
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 9501b0ec60002..fb055286df46b 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -91,3 +91,17 @@ hlfir::ElementalOp Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
return elOp;
return {};
}
+
+bool Fortran::lower::hasDoubleDescriptor(mlir::Value addr) {
+ if (auto declareOp =
+ mlir::dyn_cast_or_null<hlfir::DeclareOp>(addr.getDefiningOp())) {
+ if (mlir::isa_and_nonnull<fir::AddrOfOp>(
+ declareOp.getMemref().getDefiningOp())) {
+ if (declareOp.getDataAttr() &&
+ *declareOp.getDataAttr() == cuf::DataAttribute::Pinned)
+ return false;
+ return true;
+ }
+ }
+ return false;
+}
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
index 0acdb24bf62b1..2c40991580c2e 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
@@ -62,28 +62,6 @@ static inline unsigned getMemType(cuf::DataAttribute attr) {
llvm_unreachable("unsupported memory type");
}
-template <typename OpTy>
-static bool hasDoubleDescriptors(OpTy op) {
- if (auto declareOp =
- mlir::dyn_cast_or_null<fir::DeclareOp>(op.getBox().getDefiningOp())) {
- if (mlir::isa_and_nonnull<fir::AddrOfOp>(
- declareOp.getMemref().getDefiningOp())) {
- if (isPinned(declareOp))
- return false;
- return true;
- }
- } else if (auto declareOp = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
- op.getBox().getDefiningOp())) {
- if (mlir::isa_and_nonnull<fir::AddrOfOp>(
- declareOp.getMemref().getDefiningOp())) {
- if (isPinned(declareOp))
- return false;
- return true;
- }
- }
- return false;
-}
-
static bool inDeviceContext(mlir::Operation *op) {
if (op->getParentOfType<cuf::KernelOp>())
return true;
@@ -353,7 +331,7 @@ struct CUFAllocateOpConversion
fir::FortranVariableFlagsEnum::pointer))
isPointer = true;
- if (hasDoubleDescriptors(op)) {
+ if (op.getHasDoubleDescriptor()) {
// Allocation for module variable are done with custom runtime entry point
// so the descriptors can be synchronized.
mlir::func::FuncOp func;
@@ -406,7 +384,7 @@ struct CUFDeallocateOpConversion
fir::FirOpBuilder builder(rewriter, mod);
mlir::Location loc = op.getLoc();
- if (hasDoubleDescriptors(op)) {
+ if (op.getHasDoubleDescriptor()) {
// Deallocation for module variable are done with custom runtime entry
// point so the descriptors can be synchronized.
mlir::func::FuncOp func =
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ea7890c9aac52..eb2816145c77a 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -37,8 +37,8 @@ fir.global @_QMmod1Ea {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.
func.func @_QPsub3() {
%0 = fir.address_of(@_QMmod1Ea) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
%1:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
- %2 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
- %3 = cuf.deallocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
+ %2 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
+ %3 = cuf.deallocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
return
}
@@ -109,7 +109,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
%3 = fir.convert %c1 : (index) -> i64
%4 = fir.convert %c10_i32 : (i32) -> i64
fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
- %6 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>} -> i32
+ %6 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
return
}
@@ -158,7 +158,7 @@ func.func @_QMmod1Pallocate_source_global() {
%2 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
%6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%7 = fir.load %6 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
- %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+ %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
return
}
@@ -226,7 +226,7 @@ func.func @_QQpointer_sync() attributes {fir.bindc_name = "test"} {
%3 = fir.convert %c1 : (index) -> i64
%4 = fir.convert %c10_i32 : (i32) -> i64
fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
- %6 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>} -> i32
+ %6 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
return
}
@@ -246,7 +246,7 @@ func.func @_QMmod1Ppointer_source_global() {
%2 = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
%6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
%7 = fir.load %6 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
- %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+ %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
return
}
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 2cf8c7d336812..393faff6046bc 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -235,3 +235,21 @@ end subroutine
! CHECK-LABEL: func.func @_QPcuda_component()
! CHECK: cuf.allocate
+
+subroutine module_allocate()
+ use globals
+ allocate(a_device(10))
+ allocate(a_managed(10))
+ allocate(a_pinned(10))
+ deallocate(a_device)
+ deallocate(a_managed)
+ deallocate(a_pinned)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPmodule_allocate()
+! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
+! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
|
SusanTan
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thank you valentin!
Similar to the double descriptor information added in #170901, we need to carry over the pointer information until the op can be converted. Otherwise, correct detection fails when the op is converted late.
… ops (#170937) Similar for the double descriptor information added in llvm/llvm-project#170901, we need to carry over the pointer information until the op can be converted. The correct detection would fail if the op is converted late.
…e operations (llvm#170901) After llvm#169740, the allocate and deallocate cuf operation can be converted later. Update the way to recognize double descriptor case by adding this information directly on the operation itself.
…170937) Similar for the double descriptor information added in llvm#170901, we need to carry over the pointer information until the op can be converted. The correct detection would fail if the op is converted late.
After #169740, the cuf allocate and deallocate operations can be converted later. Update the way the double descriptor case is recognized by adding this information directly on the operation itself.