diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d433ce367d259..c6c428860bca1 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -376,6 +376,8 @@ class PopulateInitAndCleanupRegionsHelper {
     loadedMoldArg = builder.loadIfRef(loc, moldArg);
     return loadedMoldArg;
   }
+
+  bool shouldAllocateTempOnStack() const;
 };
 
 } // namespace
@@ -438,8 +440,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
     builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
   }
 
-  mlir::Value valAlloc = builder.createHeapTemporary(loc, innerTy, /*name=*/{},
-                                                     /*shape=*/{}, lenParams);
+  bool shouldAllocateOnStack = shouldAllocateTempOnStack();
+  mlir::Value valAlloc =
+      (shouldAllocateOnStack)
+          ? builder.createTemporary(loc, innerTy, /*name=*/{},
+                                    /*shape=*/{}, lenParams)
+          : builder.createHeapTemporary(loc, innerTy, /*name=*/{},
+                                        /*shape=*/{}, lenParams);
+
   if (scalarInitValue)
     builder.createStoreWithConvert(loc, scalarInitValue, valAlloc);
   mlir::Value box = fir::EmboxOp::create(builder, loc, valType, valAlloc,
@@ -451,8 +459,9 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   fir::StoreOp lastOp =
       fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg);
 
-  createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                      isDoConcurrent);
+  if (!shouldAllocateOnStack)
+    createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                        isDoConcurrent);
 
   if (ifUnallocated)
     builder.setInsertionPointAfter(ifUnallocated);
@@ -462,6 +471,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   createYield(allocatedPrivVarArg);
 }
 
+bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
+  // On the GPU, always allocate on the stack since heap allocations are very
+  // expensive.
+  auto offloadMod =
+      llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
+  return offloadMod && offloadMod.getIsGPU();
+}
+
 void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     fir::BaseBoxType boxTy, bool needsInitialization) {
   bool isAllocatableOrPointer =
@@ -504,15 +521,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
   // Allocating on the heap in case the whole reduction/privatization is nested
   // inside of a loop
   auto temp = [&]() {
-    bool shouldAllocateOnStack = false;
-
-    // On the GPU, always allocate on the stack since heap allocatins are very
-    // expensive.
-    if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
-            *builder.getModule()))
-      shouldAllocateOnStack = offloadMod.getIsGPU();
-
-    if (shouldAllocateOnStack)
+    if (shouldAllocateTempOnStack())
       return createStackTempFromMold(loc, builder, source);
 
     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
index 3d93fbc6e446e..272f34fc0fd1a 100644
--- a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-allocatable.f90
@@ -1,9 +1,22 @@
 ! Tests delayed privatization for `targets ... private(..)` for allocatables.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
-! RUN: -o - %s 2>&1 | FileCheck %s
+! RUN: -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
 ! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging -o - %s 2>&1 \
-! RUN: | FileCheck %s
+! RUN: | FileCheck %s --check-prefix=CPU
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
+! RUN: -fopenmp -fopenmp-is-target-device \
+! RUN: -mmlir --enable-delayed-privatization-staging \
+! RUN: -o - %s 2>&1 | \
+! RUN: FileCheck %s --check-prefix=GPU \
+! RUN: %}
+
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging \
+! RUN: -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 \
+! RUN: | FileCheck %s --check-prefix=GPU
 
 subroutine target_allocatable
   implicit none
@@ -14,53 +27,65 @@ subroutine target_allocatable
 !$omp end target
 end subroutine target_allocatable
 
-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] :
-! CHECK-SAME: [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
-! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+! CPU-LABEL: omp.private {type = private}
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM:.*]] :
+! CPU-SAME: [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
+! CPU: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+
+! CPU-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
+! CPU-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
+! CPU-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
 
-! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
-! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
-! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
-! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CPU-NEXT: fir.if %[[ALLOC_COND]] {
+! CPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT: } else {
+! CPU-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
+! CPU-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT: }
 
-! CHECK-NEXT: fir.if %[[ALLOC_COND]] {
-! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT: } else {
-! CHECK-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
-! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT: }
+! CPU-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
 
-! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
+! CPU-NEXT: } dealloc {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
 
-! CHECK-NEXT: } dealloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! CPU-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! CPU-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
+! CPU-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
+! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
 
-! CHECK-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
-! CHECK-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
-! CHECK-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
-! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
+! CPU-NEXT: fir.if %[[PRIV_NULL_COND]] {
+! CPU-NEXT: fir.freemem %[[PRIV_ADDR]]
+! CPU-NEXT: }
 
-! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] {
-! CHECK-NEXT: fir.freemem %[[PRIV_ADDR]]
-! CHECK-NEXT: }
+! CPU-NEXT: omp.yield
+! CPU-NEXT: }
 
-! CHECK-NEXT: omp.yield
-! CHECK-NEXT: }
+! CPU-LABEL: func.func @_QPtarget_allocatable() {
 
-! CHECK-LABEL: func.func @_QPtarget_allocatable() {
+! CPU: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
+! CPU-SAME: {bindc_name = "alloc_var", {{.*}}}
+! CPU: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
+! CPU: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
+! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
+! CPU: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
 
-! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
-! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}}
-! CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
-! CHECK: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
-! CHECK: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
-! CHECK: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
+! CPU: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
 
-! CHECK: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
-! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
+! GPU-LABEL: omp.private {type = private} {{.*}} init {
+! GPU: fir.if %{{.*}} {
+! GPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %{{.*}}
+! GPU-NEXT: fir.store %[[ZERO_BOX]] to %{{.*}}
+! GPU-NEXT: } else {
+! GPU-NOT: fir.allocmem i32
+! GPU-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32
+! GPU-NEXT: %[[PRIV_ALLOC_BOX:.*]] = fir.embox %[[PRIV_ALLOC]]
+! GPU-NEXT: fir.store %[[PRIV_ALLOC_BOX]] to %{{.*}}
+! GPU-NEXT: }
+! GPU-NEXT: omp.yield(%{{.*}})