Skip to content

Commit

Permalink
[flang][cuda] Lower ALLOCATE for device variable (#88980)
Browse files Browse the repository at this point in the history
Replace the runtime call to `AllocatableAllocate` for CUDA device
variable to the newly added `fir.cuda_allocate` operation.
  • Loading branch information
clementval committed Apr 17, 2024
1 parent a88ea8f commit da70f2c
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 10 deletions.
57 changes: 47 additions & 10 deletions flang/lib/Lower/Allocatable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "flang/Evaluate/tools.h"
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/ConvertType.h"
#include "flang/Lower/ConvertVariable.h"
#include "flang/Lower/IterationSpace.h"
#include "flang/Lower/Mangler.h"
#include "flang/Lower/OpenACC.h"
Expand Down Expand Up @@ -368,20 +369,17 @@ class AllocateStmtHelper {
[&](const Fortran::parser::AllocOpt::Mold &mold) {
moldExpr = Fortran::semantics::GetExpr(mold.v.value());
},
[&](const Fortran::parser::AllocOpt::Stream &) {
TODO(loc, "CUDA ALLOCATE(STREAM=)");
[&](const Fortran::parser::AllocOpt::Stream &stream) {
streamExpr = Fortran::semantics::GetExpr(stream.v.value());
},
[&](const Fortran::parser::AllocOpt::Pinned &) {
TODO(loc, "CUDA ALLOCATE(PINNED=)");
[&](const Fortran::parser::AllocOpt::Pinned &pinned) {
pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value());
},
},
allocOption.u);
}

void lowerAllocation(const Allocation &alloc) {
if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
TODO(loc, "Allocation of variable with CUDA attributes");

fir::MutableBoxValue boxAddr =
genMutableBoxValue(converter, loc, alloc.getAllocObj());

Expand Down Expand Up @@ -456,7 +454,8 @@ class AllocateStmtHelper {
const fir::MutableBoxValue &box) {
if (!box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
!useAllocateRuntime && !box.isPointer()) {
!useAllocateRuntime && !box.isPointer() &&
!Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
// Pointers must use PointerAllocate so that their deallocations
// can be validated.
genInlinedAllocation(alloc, box);
Expand All @@ -472,7 +471,12 @@ class AllocateStmtHelper {
genSetType(alloc, box, loc);
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager);
mlir::Value stat;
if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
stat = genRuntimeAllocate(builder, loc, box, errorManager);
else
stat =
genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
postAllocationAction(alloc);
errorManager.assignStat(builder, loc, stat);
Expand Down Expand Up @@ -602,7 +606,10 @@ class AllocateStmtHelper {
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat;
if (isSource)
if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
stat =
genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
else if (isSource)
stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
else
stat = genRuntimeAllocate(builder, loc, box, errorManager);
Expand Down Expand Up @@ -717,13 +724,43 @@ class AllocateStmtHelper {
return nullptr;
}

mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc,
const fir::MutableBoxValue &box,
ErrorManager &errorManager,
const Fortran::semantics::Symbol &sym) {
Fortran::lower::StatementContext stmtCtx;
fir::CUDADataAttributeAttr cudaAttr =
Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
sym);
mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
mlir::Value stream =
streamExpr
? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
: nullptr;
mlir::Value pinned =
pinnedExpr
? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx))
: nullptr;
mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr;

// Keep return type the same as a standard AllocatableAllocate call.
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
return builder
.create<fir::CUDAAllocateOp>(
loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr,
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
.getResult();
}

Fortran::lower::AbstractConverter &converter;
fir::FirOpBuilder &builder;
const Fortran::parser::AllocateStmt &stmt;
const Fortran::lower::SomeExpr *sourceExpr{nullptr};
const Fortran::lower::SomeExpr *moldExpr{nullptr};
const Fortran::lower::SomeExpr *statExpr{nullptr};
const Fortran::lower::SomeExpr *errMsgExpr{nullptr};
const Fortran::lower::SomeExpr *pinnedExpr{nullptr};
const Fortran::lower::SomeExpr *streamExpr{nullptr};
// If the allocate has a type spec, lenParams contains the
// value of the length parameters that were specified inside.
llvm::SmallVector<mlir::Value> lenParams;
Expand Down
107 changes: 107 additions & 0 deletions flang/test/Lower/CUDA/cuda-allocatable.cuf
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s

! Test lowering of CUDA allocatable allocate/deallocate statements.

subroutine sub1()
real, allocatable, device :: a(:)
allocate(a(10))
end subroutine

! CHECK-LABEL: func.func @_QPsub1()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32

subroutine sub2()
real, allocatable, managed :: a(:)
integer :: istat
allocate(a(10), stat=istat)
end subroutine

! CHECK-LABEL: func.func @_QPsub2()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub2Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"}
! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>

subroutine sub3()
integer, allocatable, pinned :: a(:,:)
logical :: plog
allocate(a(20,30), pinned = plog)
end subroutine

! CHECK-LABEL: func.func @_QPsub3()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>> {bindc_name = "a", uniq_name = "_QFsub3Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"}
! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK-2: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32

subroutine sub4()
real, allocatable, unified :: a(:)
integer :: istream
allocate(a(10), stream=istream)
end subroutine

! CHECK-LABEL: func.func @_QPsub4()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub4Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<unified>} -> i32

subroutine sub5()
real, allocatable, device :: a(:)
real, allocatable :: b(:)
allocate(a, source=b)
end subroutine

! CHECK-LABEL: func.func @_QPsub5()
! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32

subroutine sub6()
real, allocatable, device :: a(:)
real, allocatable :: b(:)
allocate(a, mold=b)
end subroutine

! CHECK-LABEL: func.func @_QPsub6()
! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableApplyMold
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32

subroutine sub7()
real, allocatable, device :: a(:)
integer :: istat
character(50) :: err
allocate(a(100), stat=istat, errmsg=err)
end subroutine

! CHECK-LABEL: func.func @_QPsub7()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref<!fir.char<1,50>>, index) -> (!fir.ref<!fir.char<1,50>>, !fir.ref<!fir.char<1,50>>)
! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>

0 comments on commit da70f2c

Please sign in to comment.