diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 8c872a0579c8e..29065c9f01ecf 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -19,6 +19,7 @@
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/message.h"
 #include "flang/Semantics/attr.h"
+#include "flang/Semantics/scope.h"
 #include "flang/Semantics/symbol.h"
 #include <array>
 #include <optional>
@@ -1240,6 +1241,35 @@ inline bool HasCUDAAttrs(const Expr<SomeType> &expr) {
   return false;
 }

+/// Check if the expression is a mix of host and device variables that require
+/// an implicit data transfer.
+inline bool HasCUDAImplicitTransfer(const Expr<SomeType> &expr) {
+  unsigned hostSymbols{0};
+  unsigned deviceSymbols{0};
+  for (const Symbol &sym : CollectSymbols(expr)) {
+    if (const auto *details =
+            sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+      if (details->cudaDataAttr()) {
+        ++deviceSymbols;
+      } else {
+        if (sym.owner().IsDerivedType()) {
+          if (const auto *details =
+                  sym.owner()
+                      .GetSymbol()
+                      ->GetUltimate()
+                      .detailsIf<semantics::ObjectEntityDetails>()) {
+            if (details->cudaDataAttr()) {
+              ++deviceSymbols;
+            }
+          }
+        }
+        ++hostSymbols;
+      }
+    }
+  }
+  return hostSymbols > 0 && deviceSymbols > 0;
+}
+
 } // namespace Fortran::evaluate

 namespace Fortran::semantics {
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 5bba0978617c7..478c8f4c17ec4 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3710,16 +3710,18 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     return false;
   }

-  static void genCUDADataTransfer(fir::FirOpBuilder &builder,
-                                  mlir::Location loc, bool lhsIsDevice,
-                                  hlfir::Entity &lhs, bool rhsIsDevice,
-                                  hlfir::Entity &rhs) {
+  void genCUDADataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
+                           const Fortran::evaluate::Assignment &assign,
+                           hlfir::Entity &lhs, hlfir::Entity &rhs) {
+    bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
+    bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
     if (rhs.isBoxAddressOrValue() || lhs.isBoxAddressOrValue())
       TODO(loc, "CUDA data transfler with descriptors");
+
+    // device = host
     if (lhsIsDevice && !rhsIsDevice) {
       auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
           builder.getContext(), fir::CUDADataTransferKind::HostDevice);
-      // device = host
       if (!rhs.isVariable()) {
         auto associate = hlfir::genAssociateExpr(
             loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
@@ -3732,7 +3734,73 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       }
       return;
     }
-    TODO(loc, "Assignement with CUDA Fortran variables");
+
+    // host = device
+    if (!lhsIsDevice && rhsIsDevice) {
+      auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
+          builder.getContext(), fir::CUDADataTransferKind::DeviceHost);
+      if (!rhs.isVariable()) {
+        // evaluateRhs loads the scalar. Look for the memory reference to be
+        // used in the transfer.
+        if (mlir::isa_and_nonnull<fir::LoadOp>(rhs.getDefiningOp())) {
+          auto loadOp = mlir::dyn_cast<fir::LoadOp>(rhs.getDefiningOp());
+          builder.create<fir::CUDADataTransferOp>(loc, loadOp.getMemref(), lhs,
+                                                  transferKindAttr);
+          return;
+        }
+      } else {
+        builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs,
+                                                transferKindAttr);
+      }
+      return;
+    }
+
+    if (lhsIsDevice && rhsIsDevice) {
+      assert(rhs.isVariable() && "CUDA Fortran assignment rhs is not legal");
+      auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
+          builder.getContext(), fir::CUDADataTransferKind::DeviceDevice);
+      builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs, transferKindAttr);
+      return;
+    }
+    llvm_unreachable("Unhandled CUDA data transfer");
+  }
+
+  llvm::SmallVector<mlir::Value>
+  genCUDAImplicitDataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
+                              const Fortran::evaluate::Assignment &assign) {
+    llvm::SmallVector<mlir::Value> temps;
+    localSymbols.pushScope();
+    auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
+        builder.getContext(), fir::CUDADataTransferKind::DeviceHost);
+    unsigned nbDeviceResidentObject = 0;
+    for (const Fortran::semantics::Symbol &sym :
+         Fortran::evaluate::CollectSymbols(assign.rhs)) {
+      if (const auto *details =
+              sym.GetUltimate()
+                  .detailsIf<Fortran::semantics::ObjectEntityDetails>()) {
+        if (details->cudaDataAttr()) {
+          if (sym.owner().IsDerivedType() && IsAllocatable(sym.GetUltimate()))
+            TODO(loc, "Device resident allocatable derived-type component");
+          // TODO: This should probably be checked in semantics and a proper
+          // error emitted.
+          assert(
+              nbDeviceResidentObject <= 1 &&
+              "Only one reference to the device resident object is supported");
+          auto addr = getSymbolAddress(sym);
+          hlfir::Entity entity{addr};
+          auto [temp, cleanup] =
+              hlfir::createTempFromMold(loc, builder, entity);
+          auto needCleanup = fir::getIntIfConstant(cleanup);
+          if (needCleanup && *needCleanup)
+            temps.push_back(temp);
+          addSymbol(sym, temp, /*forced=*/true);
+          builder.create<fir::CUDADataTransferOp>(loc, addr, temp,
+                                                  transferKindAttr);
+          ++nbDeviceResidentObject;
+        }
+      }
+    }
+    return temps;
   }

   void genDataAssignment(
@@ -3741,8 +3809,13 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     mlir::Location loc = getCurrentLocation();
     fir::FirOpBuilder &builder = getFirOpBuilder();

-    bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
-    bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
+    bool isCUDATransfer = Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
+                          Fortran::evaluate::HasCUDAAttrs(assign.rhs);
+    bool hasCUDAImplicitTransfer =
+        Fortran::evaluate::HasCUDAImplicitTransfer(assign.rhs);
+    llvm::SmallVector<mlir::Value> implicitTemps;
+    if (hasCUDAImplicitTransfer)
+      implicitTemps = genCUDAImplicitDataTransfer(builder, loc, assign);

     // Gather some information about the assignment that will impact how it is
     // lowered.
@@ -3800,12 +3873,16 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       Fortran::lower::StatementContext localStmtCtx;
       hlfir::Entity rhs = evaluateRhs(localStmtCtx);
       hlfir::Entity lhs = evaluateLhs(localStmtCtx);
-      if (lhsIsDevice || rhsIsDevice) {
-        genCUDADataTransfer(builder, loc, lhsIsDevice, lhs, rhsIsDevice, rhs);
-      } else {
+      if (isCUDATransfer && !hasCUDAImplicitTransfer)
+        genCUDADataTransfer(builder, loc, assign, lhs, rhs);
+      else
         builder.create<hlfir::AssignOp>(loc, rhs, lhs,
                                         isWholeAllocatableAssignment,
                                         keepLhsLengthInAllocatableAssignment);
+      if (hasCUDAImplicitTransfer) {
+        localSymbols.popScope();
+        for (mlir::Value temp : implicitTemps)
+          builder.create<fir::FreeMemOp>(loc, temp);
       }
       return;
     }
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 54226b8623e6a..4ebd736315bcb 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -2,6 +2,12 @@

 ! Test CUDA Fortran data transfer using assignment statements.

+module mod1
+  type :: t1
+    integer :: i
+  end type
+end
+
 subroutine sub1()
   integer, device :: m
   integer, device :: adev(10)
@@ -55,3 +61,61 @@ end
 ! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
 ! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
 ! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
+
+subroutine sub2()
+  integer, device :: m
+  integer, device :: adev(10), bdev(10)
+  integer :: i, ahost(10), bhost(10)
+
+  ahost = adev
+
+  i = m
+
+  ahost(1:5) = adev(1:5)
+
+  bdev = adev
+
+  ! Implicit data transfer of adev before evaluation.
+  bhost = ahost + adev
+
+end
+
+! CHECK-LABEL: func.func @_QPsub2()
+! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub2Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[BDEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Ebdev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[BHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub2Ebhost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.cuda_data_transfer %[[ADEV]]#0 to %[[AHOST]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: fir.cuda_data_transfer %[[M]]#0 to %[[I]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<i32>, !fir.ref<i32>
+
+! CHECK: %[[DES_ADEV:.*]] = hlfir.designate %[[ADEV]]#0 (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
+! CHECK: %[[DES_AHOST:.*]] = hlfir.designate %[[AHOST]]#0 (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
+! CHECK: fir.cuda_data_transfer %[[DES_ADEV]] to %[[DES_AHOST]] {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
+
+! CHECK: fir.cuda_data_transfer %[[ADEV]]#0 to %[[BDEV]]#0 {transfer_kind = #fir.cuda_transfer<device_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+
+! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32> {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[DECL_TEMP:.*]]:2 = hlfir.declare %[[TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
+! CHECK: %[[ADEV_TEMP:.*]]:2 = hlfir.declare %{{.*}}#0 {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.heap<!fir.array<10xi32>>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
+! CHECK: fir.cuda_data_transfer %[[ADEV]]#1 to %[[DECL_TEMP]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>
+! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32>
+! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[BHOST]]#0 : !hlfir.expr<10xi32>, !fir.ref<!fir.array<10xi32>>
+! CHECK: fir.freemem %[[DECL_TEMP]]#0 : !fir.heap<!fir.array<10xi32>>
+
+subroutine sub3()
+  use mod1
+  type(t1), device :: t
+  integer :: ahost(10), bhost(10)
+
+  bhost = ahost + t%i
+end
+
+! CHECK-LABEL: func.func @_QPsub3()
+! CHECK: %[[TMP:.*]] = fir.alloca !fir.type<_QMmod1Tt1{i:i32}> {bindc_name = ".tmp"}
+! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub3Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[BHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub3Ebhost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[T:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub3Et"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
+! CHECK: %[[TMP_DECL:.*]]:2 = hlfir.declare %[[TMP]] {uniq_name = ".tmp"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
+! CHECK: fir.cuda_data_transfer %[[T]]#1 to %[[TMP_DECL]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>
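
Note on the implicit-transfer path (an illustrative sketch, not part of the patch): when the right-hand side mixes host and device variables, the new genCUDAImplicitDataTransfer copies each device-resident object into a host temporary before the expression is evaluated. The variable name tmp_adev below is hypothetical and used only for illustration; the actual lowering builds the temporary with hlfir::createTempFromMold and remaps the symbol in the local symbol table rather than introducing a source-level variable.

  ! Source statement: adev has the DEVICE attribute, ahost/bhost are host arrays.
  bhost = ahost + adev

  ! Roughly equivalent behavior after lowering:
  integer :: tmp_adev(10)
  tmp_adev = adev            ! fir.cuda_data_transfer (device_host) into the temporary
  bhost = ahost + tmp_adev   ! ordinary host evaluation, lowered to hlfir.assign
                             ! the temporary is freed afterwards (fir.freemem)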