From 305390953742920a943bb48187f35d15c2148a1d Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 11 Jul 2024 11:51:43 -0700 Subject: [PATCH 1/3] [flang][cuda] Use cuf.alloc/cuf.free for local descriptor holding cuda data --- flang/include/flang/Semantics/tools.h | 8 ++------ flang/lib/Lower/ConvertVariable.cpp | 8 ++++++-- flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp | 6 ++++-- flang/test/Fir/cuf-invalid.fir | 11 +---------- flang/test/Lower/CUDA/cuda-allocatable.cuf | 14 +++++++------- 5 files changed, 20 insertions(+), 27 deletions(-) diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 0b5308d9242de..0fcba3131fad1 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -222,7 +222,6 @@ inline bool HasCUDAAttr(const Symbol &sym) { } inline bool NeedCUDAAlloc(const Symbol &sym) { - bool inDeviceSubprogram{IsCUDADeviceContext(&sym.owner())}; if (IsDummy(sym)) { return false; } @@ -230,11 +229,8 @@ inline bool NeedCUDAAlloc(const Symbol &sym) { if (details->cudaDataAttr() && (*details->cudaDataAttr() == common::CUDADataAttr::Device || *details->cudaDataAttr() == common::CUDADataAttr::Managed || - *details->cudaDataAttr() == common::CUDADataAttr::Unified)) { - // Descriptor is allocated on host when in host context. - if (IsAllocatable(sym)) { - return inDeviceSubprogram; - } + *details->cudaDataAttr() == common::CUDADataAttr::Unified || + *details->cudaDataAttr() == common::CUDADataAttr::Pinned)) { return true; } } diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index e5a71c5ec5b4a..b239b70e7f5c1 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -715,8 +715,12 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, auto idxTy = builder.getIndexType(); for (mlir::Value sh : elidedShape) indices.push_back(builder.createConvert(loc, idxTy, sh)); - return builder.create(loc, ty, nm, symNm, dataAttr, lenParams, - indices); + mlir::Value alloc = builder.create( + loc, ty, nm, symNm, dataAttr, lenParams, indices); + converter.getFctCtx().attachCleanup([&builder, loc, alloc, dataAttr]() { + builder.create(loc, alloc, dataAttr); + }); + return alloc; } // Let the builder do all the heavy lifting. diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index 53092bed5720b..f7b36b208a7de 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -54,9 +54,11 @@ template static llvm::LogicalResult checkCudaAttr(Op op) { if (op.getDataAttr() == cuf::DataAttribute::Device || op.getDataAttr() == cuf::DataAttribute::Managed || - op.getDataAttr() == cuf::DataAttribute::Unified) + op.getDataAttr() == cuf::DataAttribute::Unified || + op.getDataAttr() == cuf::DataAttribute::Pinned) return mlir::success(); - return op.emitOpError("expect device, managed or unified cuda attribute"); + return op.emitOpError() + << "expect device, managed, pinned or unified cuda attribute"; } llvm::LogicalResult cuf::AllocOp::verify() { return checkCudaAttr(*this); } diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index 6e18e48ac82fc..06e08d14b2435 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -88,18 +88,9 @@ func.func @_QPsub1() { // ----- -func.func @_QPsub1() { - // expected-error@+1{{'cuf.alloc' op expect device, managed or unified cuda attribute}} - %0 = cuf.alloc f32 {bindc_name = "r", data_attr = #cuf.cuda, uniq_name = "_QFsub1Er"} -> !fir.ref - cuf.free %0 : !fir.ref {data_attr = #cuf.cuda} - return -} - -// ----- - func.func @_QPsub1() { %0 = cuf.alloc f32 {bindc_name = "r", data_attr = #cuf.cuda, uniq_name = "_QFsub1Er"} -> !fir.ref - // expected-error@+1{{'cuf.free' op expect device, managed or unified cuda attribute}} + // expected-error@+1{{'cuf.free' op expect device, managed, pinned or unified cuda attribute}} cuf.free %0 : !fir.ref {data_attr = #cuf.cuda} return } diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 74a3ec100a8f2..1e75b45a38dfa 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -10,7 +10,7 @@ subroutine sub1() end subroutine ! CHECK-LABEL: func.func @_QPsub1() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub1Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: fir.call @_FortranAAllocatableSetBounds ! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 @@ -35,7 +35,7 @@ subroutine sub2() end subroutine ! CHECK-LABEL: func.func @_QPsub2() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub2Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub2Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"} ! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -57,7 +57,7 @@ subroutine sub3() end subroutine ! CHECK-LABEL: func.func @_QPsub3() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub3Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub3Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub3Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"} ! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) @@ -74,7 +74,7 @@ subroutine sub4() end subroutine ! CHECK-LABEL: func.func @_QPsub4() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub4Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub4Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"} ! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -92,7 +92,7 @@ subroutine sub5() end subroutine ! CHECK-LABEL: func.func @_QPsub5() -! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub5Ea"} +! CHECK: %[[BOX_A:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub5Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub5Eb"} ! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) @@ -112,7 +112,7 @@ subroutine sub6() end subroutine ! CHECK-LABEL: func.func @_QPsub6() -! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub6Ea"} +! CHECK: %[[BOX_A:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub6Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub6Eb"} ! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) @@ -133,7 +133,7 @@ subroutine sub7() end subroutine ! CHECK-LABEL: func.func @_QPsub7() -! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub7Ea"} +! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFsub7Ea"} -> !fir.ref>>> ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub7Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"} ! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) From 4b0b77fcabe2bcc17fc34f45282a7b12fcf605e9 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 11 Jul 2024 13:14:05 -0700 Subject: [PATCH 2/3] Make sure cuf.free is placed after conditional deallocation of the data --- flang/lib/Lower/ConvertVariable.cpp | 33 +++++++++++----------- flang/test/Lower/CUDA/cuda-allocatable.cuf | 7 +++++ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index b239b70e7f5c1..3156e64112972 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -717,9 +717,9 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, indices.push_back(builder.createConvert(loc, idxTy, sh)); mlir::Value alloc = builder.create( loc, ty, nm, symNm, dataAttr, lenParams, indices); - converter.getFctCtx().attachCleanup([&builder, loc, alloc, dataAttr]() { - builder.create(loc, alloc, dataAttr); - }); + // converter.getFctCtx().attachCleanup([&builder, loc, alloc, dataAttr]() { + // builder.create(loc, alloc, dataAttr); + // }); return alloc; } @@ -931,6 +931,19 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter, finalizeAtRuntime(converter, var, symMap); if (mustBeDefaultInitializedAtRuntime(var)) defaultInitializeAtRuntime(converter, var, symMap); + if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) { + auto *builder = &converter.getFirOpBuilder(); + mlir::Location loc = converter.getCurrentLocation(); + fir::ExtendedValue exv = + converter.getSymbolExtendedValue(var.getSymbol(), &symMap); + auto *sym = &var.getSymbol(); + converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() { + cuf::DataAttributeAttr dataAttr = + Fortran::lower::translateSymbolCUFDataAttribute(builder->getContext(), + *sym); + builder->create(loc, fir::getBase(exv), dataAttr); + }); + } if (std::optional cleanup = needDeallocationOrFinalization(var)) { auto *builder = &converter.getFirOpBuilder(); @@ -954,22 +967,10 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter, "trying to deallocate entity not lowered as allocatable"); Fortran::lower::genDeallocateIfAllocated(*converterPtr, *mutableBox, loc, sym); + }); } } - if (Fortran::semantics::NeedCUDAAlloc(var.getSymbol())) { - auto *builder = &converter.getFirOpBuilder(); - mlir::Location loc = converter.getCurrentLocation(); - fir::ExtendedValue exv = - converter.getSymbolExtendedValue(var.getSymbol(), &symMap); - auto *sym = &var.getSymbol(); - converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() { - cuf::DataAttributeAttr dataAttr = - Fortran::lower::translateSymbolCUFDataAttribute(builder->getContext(), - *sym); - builder->create(loc, fir::getBase(exv), dataAttr); - }); - } } //===----------------------------------------------------------------===// diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 1e75b45a38dfa..82c1063507535 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -25,6 +25,7 @@ end subroutine ! CHECK: fir.if %[[NE_C0]] { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub2() real, allocatable, managed :: a(:) @@ -49,6 +50,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub3() integer, allocatable, pinned :: a(:,:) @@ -66,6 +68,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub4() real, allocatable, device :: a(:) @@ -84,6 +87,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub5() real, allocatable, device :: a(:) @@ -104,6 +108,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub6() real, allocatable, device :: a(:) @@ -122,6 +127,7 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_A_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} subroutine sub7() real, allocatable, device :: a(:) @@ -150,3 +156,4 @@ end subroutine ! CHECK: fir.if %{{.*}} { ! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 ! CHECK: } +! CHECK: cuf.free %[[BOX_DECL]]#1 : !fir.ref>>> {data_attr = #cuf.cuda} From 3fc607c9e5da968d470f5c76beccb6e4f3087f77 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 11 Jul 2024 13:22:50 -0700 Subject: [PATCH 3/3] Remove commented code --- flang/lib/Lower/ConvertVariable.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 3156e64112972..47ad48fb322cc 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -717,9 +717,6 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, indices.push_back(builder.createConvert(loc, idxTy, sh)); mlir::Value alloc = builder.create( loc, ty, nm, symNm, dataAttr, lenParams, indices); - // converter.getFctCtx().attachCleanup([&builder, loc, alloc, dataAttr]() { - // builder.create(loc, alloc, dataAttr); - // }); return alloc; }