diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp index ff1a225d66ce9..662703dfb6321 100644 --- a/flang-rt/lib/cuda/allocatable.cpp +++ b/flang-rt/lib/cuda/allocatable.cpp @@ -57,26 +57,34 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream, int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFAllocatableAllocate)( alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - alloc, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(alloc, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { - int stat{RTNAME(CUFAllocatableAllocateSync)( - alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { + int stat; + if (sourceIsDevice) { + stat = RTNAME(CUFAllocatableAllocate)( + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine); + } else { + stat = RTNAME(CUFAllocatableAllocateSync)( + alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine); + } if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - alloc, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(alloc, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp index d3f5cfe8e96a1..f07b1a9b60924 100644 --- a/flang-rt/lib/cuda/pointer.cpp +++ b/flang-rt/lib/cuda/pointer.cpp @@ -56,26 +56,28 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream, int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFPointerAllocate)( pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - pointer, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(pointer, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer, const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat, - const Descriptor *errMsg, const char *sourceFile, int sourceLine) { + const Descriptor *errMsg, const char *sourceFile, int sourceLine, + bool sourceIsDevice) { int stat{RTNAME(CUFPointerAllocateSync)( pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; - Fortran::runtime::DoFromSourceAssign( - pointer, source, terminator, &MemmoveHostToDevice); + Fortran::runtime::DoFromSourceAssign(pointer, source, terminator, + sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice); } return stat; } diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index 636879f28a2fb..34ac21c51b933 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -100,8 +100,9 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments, Optional:$stream, Arg, "", [MemWrite]>:$pinned, Arg, "", [MemRead]>:$source, - cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat, - UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer); + OptionalAttr:$data_attr, UnitAttr:$hasStat, + UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer, + UnitAttr:$device_source); let results = (outs AnyIntegerType:$stat); diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h index 6c97afa9e10e8..97f24bc34bfb8 100644 --- a/flang/include/flang/Runtime/CUDA/allocatable.h +++ b/flang/include/flang/Runtime/CUDA/allocatable.h @@ -34,14 +34,16 @@ int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr, int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform deallocation of the descriptor with synchronization of it when /// necessary. diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h index bdfc3268e0814..b845fd59114d4 100644 --- a/flang/include/flang/Runtime/CUDA/pointer.h +++ b/flang/include/flang/Runtime/CUDA/pointer.h @@ -34,14 +34,16 @@ int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr, int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); /// Perform allocation of the descriptor with synchronization of it when /// necessary. Assign data from source. int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer, const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false, const Descriptor *errMsg = nullptr, - const char *sourceFile = nullptr, int sourceLine = 0); + const char *sourceFile = nullptr, int sourceLine = 0, + bool sourceIsDevice = false); } // extern "C" diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index c9a9d935bd615..030439550cd15 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -629,9 +629,10 @@ class AllocateStmtHelper { unsigned allocatorIdx = Fortran::lower::getAllocatorIdx(alloc.getSymbol()); fir::ExtendedValue exv = isSource ? sourceExv : moldExv; + bool sourceIsDevice = false; if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)}) if (Fortran::semantics::IsCUDADevice(*sym)) - TODO(loc, "CUDA Fortran: allocate with device source"); + sourceIsDevice = true; // Generate a sequence of runtime calls. errorManager.genStatCheck(builder, loc); @@ -651,7 +652,7 @@ class AllocateStmtHelper { genSetDeferredLengthParameters(alloc, box); genAllocateObjectBounds(alloc, box); mlir::Value stat; - if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) { + if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()) || sourceIsDevice) { stat = genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol()); } else { @@ -798,13 +799,19 @@ class AllocateStmtHelper { // Keep return type the same as a standard AllocatableAllocate call. mlir::Type retTy = fir::runtime::getModel()(builder.getContext()); + bool isSourceDevice = false; + if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)}) + if (Fortran::semantics::IsCUDADevice(*sym)) + isSourceDevice = true; + bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr()); return cuf::AllocateOp::create( builder, loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr, errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr, doubleDescriptors ? builder.getUnitAttr() : nullptr, - box.isPointer() ? builder.getUnitAttr() : nullptr) + box.isPointer() ? builder.getUnitAttr() : nullptr, + isSourceDevice ? builder.getUnitAttr() : nullptr) .getResult(); } diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp index 6579c2362cd87..4444fc61239ea 100644 --- a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp @@ -99,7 +99,6 @@ static mlir::LogicalResult convertOpToCall(OpTy op, mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true) : builder.createBool(loc, false); - mlir::Value errmsg; if (op.getErrmsg()) { errmsg = op.getErrmsg(); @@ -116,12 +115,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op, loc, fir::ReferenceType::get( mlir::IntegerType::get(op.getContext(), 1))); if (op.getSource()) { + mlir::Value isDeviceSource = op.getDeviceSource() + ? builder.createBool(loc, true) + : builder.createBool(loc, false); mlir::Value stream = op.getStream() ? op.getStream() : builder.createNullConstant(loc, fTy.getInput(2)); args = fir::runtime::createArguments( builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned, - hasStat, errmsg, sourceFile, sourceLine); + hasStat, errmsg, sourceFile, sourceLine, isDeviceSource); } else { mlir::Value stream = op.getStream() ? op.getStream() diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 9d0d181609ada..5184561a03e67 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -128,11 +128,14 @@ func.func @_QPallocate_source() { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"} + %devsource = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"} %4 = fir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref>>>) -> !fir.ref>>> %5 = cuf.alloc !fir.box>> {bindc_name = "a_d", data_attr = #cuf.cuda, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref>>> %7 = fir.declare %5 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref>>>) -> !fir.ref>>> %8 = fir.load %4 : !fir.ref>>> %22 = cuf.allocate %7 : !fir.ref>>> source(%8 : !fir.box>>) {data_attr = #cuf.cuda} -> i32 + %9 = fir.load %devsource : !fir.ref>>> + %23 = cuf.allocate %7 : !fir.ref>>> source(%9 : !fir.box>>) {device_source} -> i32 return } @@ -142,8 +145,8 @@ func.func @_QPallocate_source() { // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref>>> // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box>>) -> !fir.box -// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 - +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.ref, !fir.ref, i1, !fir.box, !fir.ref, i32, i1) -> i32 +// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %true{{.*}}) fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda} : !fir.box>> { %c0 = arith.constant 0 : index diff --git a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf b/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf deleted file mode 100644 index 3e59e2f01119e..0000000000000 --- a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf +++ /dev/null @@ -1,9 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fcuda -o - %s 2>&1 | FileCheck %s - -program main - implicit none - integer, device, allocatable :: a_d(:) - integer, allocatable :: a(:) -! CHECK: not yet implemented: CUDA Fortran: allocate with device source - allocate(a, source=a_d) -end program diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 43e716532ecca..52303d126b8dc 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -261,3 +261,12 @@ end subroutine ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda, hasDoubleDescriptor} -> i32 ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda, hasDoubleDescriptor} -> i32 ! CHECK: cuf.deallocate %{{.*}} : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + +attributes(global) subroutine from_device_source() + real, device, allocatable :: a(:) + real, allocatable :: b(:) + allocate(b, source=a) +end subroutine + +! CHECK-LABEL: func.func @_QPfrom_device_source() +! CHECK: cuf.allocate{{.*}}device_source