diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index fddeba98adccc..ed6ccab06181a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1784,8 +1784,8 @@ void CGOpenMPRuntimeGPU::emitReduction( llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createReductionsGPU( - OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, - llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, + OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, /*IsByRef=*/{}, false, + TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); CGF.Builder.restoreIP(AfterIP); diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 58a317cf5d691..ff4dab1136ee9 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3743,7 +3743,8 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, }]; let arguments = (ins SymbolNameAttr:$sym_name, - TypeAttr:$type); + TypeAttr:$type, + OptionalAttr:$byref_element_type); let regions = (region MaxSizedRegion<1>:$allocRegion, AnyRegion:$initializerRegion, diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp index 605a5b6b20b94..1bce6b3b3c832 100644 --- a/flang/lib/Lower/Support/ReductionProcessor.cpp +++ b/flang/lib/Lower/Support/ReductionProcessor.cpp @@ -573,10 +573,18 @@ OpType ReductionProcessor::createDeclareReduction( mlir::OpBuilder modBuilder(module.getBodyRegion()); mlir::Type valTy = fir::unwrapRefType(type); + // For by-ref reductions, we want to keep track of the + // boxed/referenced/allocated type. For example, for a `real, allocatable` + // variable, `real` should be stored. 
+ mlir::TypeAttr boxedTy{}; + if (!isByRef) type = valTy; - decl = OpType::create(modBuilder, loc, reductionOpName, type); + if (isByRef) + boxedTy = mlir::TypeAttr::get(fir::unwrapPassByRefType(valTy)); + + decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTy); createReductionAllocAndInitRegions(converter, loc, decl, redId, type, isByRef); diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 1229018bd9b3e..11609ea7b6040 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -851,7 +851,8 @@ class DoConcurrentConversion if (!ompReducer) { ompReducer = mlir::omp::DeclareReductionOp::create( rewriter, firReducer.getLoc(), ompReducerName, - firReducer.getTypeAttr().getValue()); + firReducer.getTypeAttr().getValue(), + firReducer.getByrefElementTypeAttr()); cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(), ompReducer.getAllocRegion()); diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 index 4b6a643f94059..4c7b6ac5f5f9b 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 @@ -22,7 +22,7 @@ subroutine red_and_delayed_private ! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32 ! CHECK-LABEL: omp.declare_reduction -! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref alloc +! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref attributes {byref_element_type = i32} alloc ! CHECK-LABEL: _QPred_and_delayed_private ! 
CHECK: omp.parallel diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index 41c7d69ebb3ba..f56875dcb518b 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -18,7 +18,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref>>> attributes {byref_element_type = !fir.array} alloc { ! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.box>> ! CHECK: omp.yield(%[[VAL_10]] : !fir.ref>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 index aa91e1e0e8b15..d9ba3bed464f8 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 @@ -12,7 +12,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref>> {{.*}} alloc { ! CHECK: %[[VAL_15:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_15]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 index 59595de338d50..636660f279e85 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 @@ -17,7 +17,7 @@ program reduce print *,i end program -! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> alloc { +! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> attributes {byref_element_type = !fir.array<3xi32>} alloc { ! 
CPU: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CPU: omp.yield(%[[VAL_8]] : !fir.ref>>) ! CPU-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 14338c6f50817..9cf8a63427ed1 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -13,7 +13,7 @@ program reduce print *,i end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index 36344458d1cae..3de2ba8f61f8e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -19,7 +19,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref>>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref>>> attributes {byref_element_type = !fir.array} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90 index 9af18378f0ae0..da337378862be 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction3.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref>> alloc { +! 
CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 index 8b94d51f986f5..4a0593ff9eca4 100644 --- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 +++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 @@ -9,7 +9,7 @@ subroutine max_array_reduction(l, r) !$omp end parallel end subroutine -! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90 index 2f2808cebfc0c..0dbe9e3673395 100644 --- a/flang/test/Lower/OpenMP/sections-array-reduction.f90 +++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90 @@ -14,7 +14,7 @@ subroutine sectionsReduction(x) end subroutine -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref>> {{.*}} alloc { ! [...] ! CHECK: omp.yield ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 index 18a4f75b86309..3a63bb09c59de 100644 --- a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 +++ b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! 
CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref>> {{.*}} alloc { ! [...] ! CHECK: omp.yield ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index 2cd953de0dffa..ed81577ecce16 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -32,7 +32,7 @@ program reduce15 print *,"min: ", mins end program -! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref>>> alloc { +! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) ! CHECK-LABEL: } init { @@ -93,7 +93,7 @@ program reduce15 ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref>>> alloc { +! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref>>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box>> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index 663851cba46c6..d8c0a36db126e 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -18,7 +18,7 @@ program reduce end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref>> attributes {byref_element_type = i32} alloc { ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_2]] : !fir.ref>>) ! 
CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 index 209ee9a4e0cef..28acb8f19531f 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 @@ -22,7 +22,7 @@ subroutine reduce(r) end subroutine end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 index 2233a74600948..ec448cf20f111 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 @@ -11,7 +11,7 @@ program reduce !$omp end parallel do end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> {{.*}} alloc { ! CHECK: } combiner { ! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref>>, %[[ARG1:.*]]: !fir.ref>>): ! CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 index 211bde19da8db..9da05a290ec21 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 @@ -19,7 +19,7 @@ subroutine sub(a, lb, ub) end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref>> {{.*}} alloc { ! CHECK: } combiner { ! 
CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref>>, %[[ARG1:.*]]: !fir.ref>>): ! CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index afaeba27c5eae..14b657c8e180d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -14,7 +14,7 @@ program reduce print *,r end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> attributes {byref_element_type = !fir.array<2xi32>} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index 25b2e97a1b7f7..d0a0c38e4ccb1 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -14,7 +14,7 @@ program reduce print *,r end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref>> {{.*}} alloc { ! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index edd2bcb1d6be8..60a162d8f8002 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -24,7 +24,7 @@ program main endprogram -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref>> {{.*}} alloc { ! 
CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index 27b726376fbeb..f640f5caddf76 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -18,7 +18,7 @@ program reduce_pointer deallocate(v) end program -! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref>> alloc { +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref>> {{.*}} alloc { ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box> ! CHECK: omp.yield(%[[VAL_3]] : !fir.ref>>) ! CHECK-LABEL: } init { diff --git a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 index 873fd10dd1b97..4fb67c094b594 100644 --- a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 +++ b/flang/test/Lower/do_concurrent_reduce_allocatable.f90 @@ -8,7 +8,7 @@ subroutine do_concurrent_allocatable end do end subroutine -! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] alloc { +! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] attributes {byref_element_type = !fir.array} alloc { ! CHECK: %[[ALLOC:.*]] = fir.alloca ! CHECK: fir.yield(%[[ALLOC]] : ![[RED_TYPE]]) ! 
CHECK: } init { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 5331cb5abdc6f..dff376f3a22e4 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1448,17 +1448,22 @@ class OpenMPIRBuilder { ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, ReductionGenClangCBTy ReductionGenClang, - ReductionGenAtomicCBTy AtomicReductionGen) + ReductionGenAtomicCBTy AtomicReductionGen, + Type *ByRefAllocatedType = nullptr, + Type *ByRefElementType = nullptr) : ElementType(ElementType), Variable(Variable), PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), - AtomicReductionGen(AtomicReductionGen) {} + AtomicReductionGen(AtomicReductionGen), + ByRefAllocatedType(ByRefAllocatedType), + ByRefElementType(ByRefElementType) {} ReductionInfo(Value *PrivateVariable) : ElementType(nullptr), Variable(nullptr), PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} - /// Reduction element type, must match pointee type of variable. + /// Reduction element type, must match pointee type of variable. For by-ref + /// reductions, this would be just an opaque `ptr`. Type *ElementType; /// Reduction variable of pointer type. @@ -1485,6 +1490,18 @@ class OpenMPIRBuilder { /// reduction. If null, the implementation will use the non-atomic version /// along with the appropriate synchronization mechanisms. ReductionGenAtomicCBTy AtomicReductionGen; + + /// For by-ref reductions, we need to keep track of 2 extra types that are + /// potentially different: + /// * The allocated type is the type of the storage allocated by the + /// reduction op's `alloc` region. 
For example, for allocatables and arrays, + /// this type would be the descriptor/box struct. + Type *ByRefAllocatedType; + /// * The by-ref element type is the type of the actual storage needed for + /// the data of the allocatable or array. For example, a float allocatable + /// would need some float storage to store intermediate reduction + /// results. + Type *ByRefElementType; }; enum class CopyAction : unsigned { @@ -1529,14 +1546,15 @@ class OpenMPIRBuilder { /// Function to shuffle over the value from the remote lane. void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, - Type *ElementType, Value *Offset, - Type *ReductionArrayTy); + Type *ElementType, Value *Offset, Type *ReductionArrayTy, + bool IsByRefElem); /// Emit instructions to copy a Reduce list, which contains partially /// aggregated values, in the specified direction. void emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef ReductionInfos, Value *SrcBase, Value *DestBase, + ArrayRef IsByRef, CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}); /// Emit a helper that reduces data across two OpenMP threads (lanes) @@ -1610,11 +1628,13 @@ class OpenMPIRBuilder { /// \param ReduceFn The reduction function. /// \param FuncAttrs Optional param to specify any function attributes that /// need to be copied to the new function. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// /// \return The ShuffleAndReduce function. Function *emitShuffleAndReduceFunction( ArrayRef ReductionInfos, - Function *ReduceFn, AttributeList FuncAttrs); + Function *ReduceFn, AttributeList FuncAttrs, ArrayRef IsByRef); /// Helper function for CreateCanonicalScanLoops to create InputLoop /// in the firstGen and Scan Loop in the SecondGen @@ -1674,12 +1694,14 @@ class OpenMPIRBuilder { /// \param ReductionInfos Array type containing the ReductionOps. 
/// \param FuncAttrs Optional param to specify any function attributes that /// need to be copied to the new function. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// /// \return The InterWarpCopy function. Expected emitInterWarpCopyFunction(const LocationDescription &Loc, ArrayRef ReductionInfos, - AttributeList FuncAttrs); + AttributeList FuncAttrs, ArrayRef IsByRef); /// This function emits a helper that copies all the reduction variables from /// the team into the provided global buffer for the reduction variables. @@ -1773,6 +1795,7 @@ class OpenMPIRBuilder { /// \return The reduction function. Expected createReductionFunction( StringRef ReducerName, ArrayRef ReductionInfos, + ArrayRef IsByRef, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, AttributeList FuncAttrs = {}); @@ -2025,11 +2048,14 @@ class OpenMPIRBuilder { /// reduction variables. /// \param AllocaIP An insertion point suitable for allocas usable /// in reductions. - /// \param CodeGenIP An insertion point suitable for code - /// generation. \param ReductionInfos A list of info on each reduction - /// variable. \param IsNoWait Optional flag set if the reduction is - /// marked as - /// nowait. + /// \param CodeGenIP An insertion point suitable for code + /// generation. + /// \param ReductionInfos A list of info on each reduction + /// variable. + /// \param IsNoWait Optional flag set if the reduction is + /// marked as nowait. + /// \param IsByRef For each reduction clause, whether the reduction is by-ref + /// or not. /// \param IsTeamsReduction Optional flag set if it is a teams /// reduction. /// \param GridValue Optional GPU grid value. 
@@ -2039,7 +2065,8 @@ class OpenMPIRBuilder { LLVM_ABI InsertPointOrErrorTy createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef ReductionInfos, - bool IsNoWait = false, bool IsTeamsReduction = false, + ArrayRef IsByRef, bool IsNoWait = false, + bool IsTeamsReduction = false, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, std::optional GridValue = {}, unsigned ReductionBufNum = 1024, Value *SrcLocInfo = nullptr); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0e5926ff0fb18..db25b6ea357ce 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2450,7 +2450,8 @@ Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP, void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, Type *ElemType, - Value *Offset, Type *ReductionArrayTy) { + Value *Offset, Type *ReductionArrayTy, + bool IsByRefElem) { uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType); // Create the loop over the big sized data. 
// ptr = (void*)Elem; @@ -2535,7 +2536,7 @@ void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, void OpenMPIRBuilder::emitReductionListCopy( InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, ArrayRef ReductionInfos, Value *SrcBase, Value *DestBase, - CopyOptionsTy CopyOptions) { + ArrayRef IsByRef, CopyOptionsTy CopyOptions) { Type *IndexTy = Builder.getIndexTy( M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; @@ -2545,6 +2546,7 @@ void OpenMPIRBuilder::emitReductionListCopy( for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); Value *SrcElementAddr = nullptr; + AllocaInst *DestAlloca = nullptr; Value *DestElementAddr = nullptr; Value *DestElementPtrAddr = nullptr; // Should we shuffle in an element from a remote lane? @@ -2564,14 +2566,18 @@ void OpenMPIRBuilder::emitReductionListCopy( DestElementPtrAddr = Builder.CreateInBoundsGEP( ReductionArrayTy, DestBase, {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]); switch (Action) { case CopyAction::RemoteLaneToThread: { InsertPointTy CurIP = Builder.saveIP(); Builder.restoreIP(AllocaIP); - AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr, - ".omp.reduction.element"); + + Type *DestAllocaType = + IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType; + DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr, + ".omp.reduction.element"); DestAlloca->setAlignment( - M.getDataLayout().getPrefTypeAlign(RI.ElementType)); + M.getDataLayout().getPrefTypeAlign(DestAllocaType)); DestElementAddr = DestAlloca; DestElementAddr = Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(), @@ -2591,8 +2597,49 @@ void OpenMPIRBuilder::emitReductionListCopy( // Now that all active lanes have read the element in the // Reduce list, shuffle over the value from the remote lane. 
if (ShuffleInElement) { - shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType, - RemoteLaneOffset, ReductionArrayTy); + Type *ShuffleType = RI.ElementType; + Value *ShuffleSrcAddr = SrcElementAddr; + Value *ShuffleDestAddr = DestElementAddr; + Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0); + AllocaInst *LocalStorage = nullptr; + + if (IsByRefElem) { + assert(RI.ByRefElementType && "Expected by-ref element type to be set"); + assert(RI.ByRefAllocatedType && + "Expected by-ref allocated type to be set"); + // For by-ref reductions, we need to copy from the remote lane the + // actual value of the partial reduction computed by that remote lane; + // rather than, for example, a pointer to that data or, even worse, a + // pointer to the descriptor of the by-ref reduction element. + ShuffleType = RI.ByRefElementType; + + ShuffleSrcAddr = Builder.CreateGEP(RI.ByRefAllocatedType, + ShuffleSrcAddr, {Zero, Zero}); + ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr); + + { + auto OldIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + + LocalStorage = Builder.CreateAlloca(ShuffleType); + Builder.restoreIP(OldIP); + ShuffleDestAddr = LocalStorage; + } + } + + shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType, + RemoteLaneOffset, ReductionArrayTy, IsByRefElem); + + if (IsByRefElem) { + auto *GEP = + Builder.CreateGEP(RI.ByRefAllocatedType, + Builder.CreatePointerBitCastOrAddrSpaceCast( + DestAlloca, Builder.getPtrTy(), ".ascast"), + {Zero, Zero}); + Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast( + LocalStorage, Builder.getPtrTy(), ".ascast"), + GEP); + } } else { switch (RI.EvaluationKind) { case EvalKind::Scalar: { @@ -2647,7 +2694,7 @@ void OpenMPIRBuilder::emitReductionListCopy( Expected OpenMPIRBuilder::emitInterWarpCopyFunction( const LocationDescription &Loc, ArrayRef ReductionInfos, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef IsByRef) { InsertPointTy 
SavedIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( @@ -2728,7 +2775,9 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( // memory. // const ReductionInfo &RI = En.value(); - unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType); + bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()]; + unsigned RealTySize = M.getDataLayout().getTypeAllocSize( + IsByRefElem ? RI.ByRefElementType : RI.ElementType); for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) { Type *CType = Builder.getIntNTy(TySize * 8); @@ -2791,6 +2840,15 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( ConstantInt::get(IndexTy, En.index())}); // elemptr = ((CopyType*)(elemptrptr)) + I Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); + + if (IsByRefElem) { + Type *Int32Ty = Builder.getInt32Ty(); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + ElemPtr = + Builder.CreateGEP(RI.ByRefAllocatedType, ElemPtr, {Zero, Zero}); + ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr); + } + if (NumIters > 1) ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt); @@ -2846,6 +2904,15 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( Value *TargetElemPtrVal = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr); Value *TargetElemPtr = TargetElemPtrVal; + + if (IsByRefElem) { + Type *Int32Ty = Builder.getInt32Ty(); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + TargetElemPtr = Builder.CreateGEP(RI.ByRefAllocatedType, TargetElemPtr, + {Zero, Zero}); + TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr); + } + if (NumIters > 1) TargetElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt); @@ -2882,7 +2949,7 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef IsByRef) 
{ LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get(Builder.getVoidTy(), @@ -2961,9 +3028,10 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( // This loop iterates through the list of reduce elements and copies, // element by element, from a remote lane in the warp to RemoteReduceList, // hosted on the thread's stack. - emitReductionListCopy( - AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos, - ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr}); + emitReductionListCopy(AllocaIP, CopyAction::RemoteLaneToThread, + RedListArrayTy, ReductionInfos, ReduceList, + RemoteListAddrCast, IsByRef, + {RemoteLaneOffset, nullptr, nullptr}); // The actions to be performed on the Remote Reduce list is dependent // on the algorithm version. @@ -3032,7 +3100,8 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent()); emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, - ReductionInfos, RemoteListAddrCast, ReduceList); + ReductionInfos, RemoteListAddrCast, ReduceList, + IsByRef); Builder.CreateBr(CpyMergeBB); emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent()); @@ -3437,7 +3506,8 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { Expected OpenMPIRBuilder::createReductionFunction( StringRef ReducerName, ArrayRef ReductionInfos, - ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) { + ArrayRef IsByRef, ReductionGenCBKind ReductionGenCBKind, + AttributeList FuncAttrs) { auto *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, /* IsVarArg */ false); @@ -3498,8 +3568,14 @@ Expected OpenMPIRBuilder::createReductionFunction( LHSPtrs.emplace_back(LHSPtr); RHSPtrs.emplace_back(RHSPtr); } else { - Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); - Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + Value *LHS = LHSPtr; + Value *RHS = 
RHSPtr; + + if (!IsByRef.empty() && !IsByRef[En.index()]) { + LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); + RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + } + Value *Reduced; InsertPointOrErrorTy AfterIP = RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced); @@ -3509,7 +3585,9 @@ Expected OpenMPIRBuilder::createReductionFunction( return ReductionFunc; Builder.restoreIP(*AfterIP); - Builder.CreateStore(Reduced, LHSPtr); + + if (!IsByRef.empty() && !IsByRef[En.index()]) + Builder.CreateStore(Reduced, LHSPtr); } } @@ -3562,9 +3640,9 @@ checkReductionInfos(ArrayRef ReductionInfos, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef ReductionInfos, - bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind, - std::optional GridValue, unsigned ReductionBufNum, - Value *SrcLocInfo) { + ArrayRef IsByRef, bool IsNoWait, bool IsTeamsReduction, + ReductionGenCBKind ReductionGenCBKind, std::optional GridValue, + unsigned ReductionBufNum, Value *SrcLocInfo) { if (!updateToLocation(Loc)) return InsertPointTy(); Builder.restoreIP(CodeGenIP); @@ -3600,9 +3678,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr); CodeGenIP = Builder.saveIP(); - Expected ReductionResult = - createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(), - ReductionInfos, ReductionGenCBKind, FuncAttrs); + Expected ReductionResult = createReductionFunction( + Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef, + ReductionGenCBKind, FuncAttrs); if (!ReductionResult) return ReductionResult.takeError(); Function *ReductionFunc = *ReductionResult; @@ -3641,15 +3719,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( Value *ElemPtr = Builder.CreateInBoundsGEP( RedArrayTy, ReductionList, {ConstantInt::get(IndexTy, 0), 
ConstantInt::get(IndexTy, En.index())}); + + auto *PrviateVar = RI.PrivateVariable; + bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()]; + if (IsByRefElem) + PrviateVar = Builder.CreateLoad(RI.ElementType, PrviateVar); + Value *CastElem = - Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); + Builder.CreatePointerBitCastOrAddrSpaceCast(PrviateVar, PtrTy); Builder.CreateStore(CastElem, ElemPtr); } CodeGenIP = Builder.saveIP(); - Function *SarFunc = - emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs); + Function *SarFunc = emitShuffleAndReduceFunction( + ReductionInfos, ReductionFunc, FuncAttrs, IsByRef); Expected CopyResult = - emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs); + emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef); if (!CopyResult) return CopyResult.takeError(); Function *WcFunc = *CopyResult; @@ -3728,7 +3812,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // Add emission of __kmpc_end_reduce{_nowait}(); for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); - Value *LHS = RI.Variable; + Type *ValueType = RI.ElementType; + Value *RedValue = RI.Variable; Value *RHS = Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); @@ -3739,7 +3824,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( // Fix the CallBack code genereated to use the correct Values for the LHS // and RHS - LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) { + LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) { return cast(U.getUser())->getParent()->getParent() == ReductionFunc; }); @@ -3748,15 +3833,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( ReductionFunc; }); } else { - Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs"); - Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs"); + if (IsByRef.empty() || 
!IsByRef[En.index()]) { + RedValue = Builder.CreateLoad(ValueType, RI.Variable, + "red.value." + Twine(En.index())); + } + Value *PrivateRedValue = Builder.CreateLoad( + ValueType, RHS, "red.private.value" + Twine(En.index())); Value *Reduced; InsertPointOrErrorTy AfterIP = - RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced); + RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced); if (!AfterIP) return AfterIP.takeError(); Builder.restoreIP(*AfterIP); - Builder.CreateStore(Reduced, LHS, false); + + if (!IsByRef.empty() && !IsByRef[En.index()]) + Builder.CreateStore(Reduced, RI.Variable); } } emitBlock(ExitBB, CurFunc); @@ -3857,7 +3948,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions( assert(ReductionInfos.size() == IsByRef.size()); if (Config.isGPU()) return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos, - IsNoWait, IsTeamsReduction); + IsByRef, IsNoWait, IsTeamsReduction); checkReductionInfos(ReductionInfos, /*IsGPU*/ false); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 377f1febf6b8f..386174a36d52c 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2011,7 +2011,9 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove, }]; let arguments = (ins SymbolNameAttr:$sym_name, - TypeAttr:$type); + TypeAttr:$type, + OptionalAttr:$byref_element_type + ); let regions = (region MaxSizedRegion<1>:$allocRegion, AnyRegion:$initializerRegion, diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp index 460595ba9f254..6423d49859c97 100644 --- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp +++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp @@ -188,7 +188,8 @@ createDecl(PatternRewriter &builder, SymbolTable &symbolTable, OpBuilder::InsertionGuard guard(builder); Type type = 
reduce.getOperands()[reductionIndex].getType(); auto decl = omp::DeclareReductionOp::create(builder, reduce.getLoc(), - "__scf_reduction", type); + "__scf_reduction", type, + /*byref_element_type=*/{}); symbolTable.insert(decl); builder.createBlock(&decl.getInitializerRegion(), diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8edec990eaaba..d0852b52f4193 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1311,7 +1311,8 @@ static void collectReductionInfo( SmallVectorImpl &owningReductionGens, SmallVectorImpl &owningAtomicReductionGens, const ArrayRef privateReductionVariables, - SmallVectorImpl &reductionInfos) { + SmallVectorImpl &reductionInfos, + ArrayRef isByRef) { unsigned numReductions = loop.getNumReductionVars(); for (unsigned i = 0; i < numReductions; ++i) { @@ -1329,12 +1330,27 @@ static void collectReductionInfo( atomicGen = owningAtomicReductionGens[i]; llvm::Value *variable = moduleTranslation.lookupValue(loop.getReductionVars()[i]); + mlir::Type allocatedType; + reductionDecls[i].getAllocRegion().walk([&](mlir::Operation *op) { + if (auto alloca = mlir::dyn_cast(op)) { + allocatedType = alloca.getElemType(); + return mlir::WalkResult::interrupt(); + } + + return mlir::WalkResult::advance(); + }); + reductionInfos.push_back( {moduleTranslation.convertType(reductionDecls[i].getType()), variable, privateReductionVariables[i], /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar, owningReductionGens[i], - /*ReductionGenClang=*/nullptr, atomicGen}); + /*ReductionGenClang=*/nullptr, atomicGen, + allocatedType ? moduleTranslation.convertType(allocatedType) : nullptr, + reductionDecls[i].getByrefElementType() + ? 
moduleTranslation.convertType( + *reductionDecls[i].getByrefElementType()) + : nullptr}); } } @@ -1400,7 +1416,7 @@ static LogicalResult createReductionsAndCleanup( // ReductionInfo only accepts references to the generators. collectReductionInfo(op, builder, moduleTranslation, reductionDecls, owningReductionGens, owningAtomicReductionGens, - privateReductionVariables, reductionInfos); + privateReductionVariables, reductionInfos, isByRef); // The call to createReductions below expects the block to have a // terminator. Create an unreachable instruction to serve as terminator @@ -2732,7 +2748,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, SmallVector reductionInfos; collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls, owningReductionGens, owningAtomicReductionGens, - privateReductionVariables, reductionInfos); + privateReductionVariables, reductionInfos, isByRef); // Move to region cont block builder.SetInsertPoint((*regionBlock)->getTerminator()); diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir new file mode 100644 index 0000000000000..af3f5e68b6ddb --- /dev/null +++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir @@ -0,0 +1,92 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @_QFfooEi_private_i32 : i32 + omp.declare_reduction @add_reduction_byref_box_heap_f32 : !llvm.ptr attributes {byref_element_type = f32} alloc { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } init { + ^bb0(%arg0: !llvm.ptr, 
%arg1: !llvm.ptr): + omp.yield(%arg1 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i32) : i32 + %4 = llvm.alloca %3 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(24 : i32) : i32 + "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %7 = llvm.mlir.constant(24 : i32) : i32 + "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr + %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + %12 = llvm.load %9 : !llvm.ptr -> f32 + %13 = llvm.load %11 : !llvm.ptr -> f32 + %14 = llvm.fadd %12, %13 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %14, %9 : f32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) + } + llvm.func @foo_() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %0 x i1 : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %9 = omp.map.info var_ptr(%5 : !llvm.ptr, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%8 : !llvm.ptr) -> !llvm.ptr {name = ""} + %10 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, implicit, descriptor, to) capture(ByRef) members(%9 : [0] : !llvm.ptr) -> !llvm.ptr {name = "scalar_alloc"} + 
omp.target map_entries(%10 -> %arg0 : !llvm.ptr) { + %13 = llvm.mlir.constant(1000 : i32) : i32 + %14 = llvm.mlir.constant(1 : i32) : i32 + omp.parallel { + omp.wsloop reduction(byref @add_reduction_byref_box_heap_f32 %arg0 -> %arg4 : !llvm.ptr) { + omp.loop_nest (%arg5) : i32 = (%14) to (%13) inclusive step (%14) { + omp.yield + } + } + omp.terminator + } + omp.terminator + } + llvm.return + } +} + +// CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} { +// CHECK: %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5) +// CHECK: %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5) +// CHECK: %[[RED_ELEM_1:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr + +// CHECK: %[[SHUFFLE_ELEM:.*]] = alloca float, align 4, addrspace(5) +// CHECK: %[[REMOTE_RED_LIST_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[REMOTE_RED_LIST]] to ptr + +// CHECK: %[[REMOTE_RED_LIST_ELEM0:.*]] = getelementptr inbounds [1 x ptr], ptr %[[REMOTE_RED_LIST_ASCAST]], i64 0, i64 0 + +// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr +// CHECK: %[[SHUFFLE_RES:.*]] = call i32 @__kmpc_shuffle_int32({{.*}}) +// CHECK: store i32 %[[SHUFFLE_RES]], ptr %[[SHUFFLE_ELEM_ASCAST]], align 4 + +// CHECK: %[[RED_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr +// CHECK: %[[RED_ALLOC_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[RED_ELEM_ASCAST]], i32 0, i32 0 +// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr +// CHECK: store ptr %[[SHUFFLE_ELEM_ASCAST]], ptr %[[RED_ALLOC_PTR]], align 8 +// CHECK: store ptr %[[RED_ELEM_1]], ptr %[[REMOTE_RED_LIST_ELEM0]], align 8 +// CHECK: } + +// CHECK: define {{.*}} @_omp_reduction_inter_warp_copy_func({{.*}}) {{.*}} { +// CHECK: %[[WARP_MASTER_CMP:.*]] = icmp eq i32 %nvptx_lane_id, 0 +// CHECK: br i1 
%[[WARP_MASTER_CMP]], label %[[WARP_MASTER_BB:.*]], label %{{.*}} + +// CHECK: [[WARP_MASTER_BB]]: +// CHECK: %[[WARP_RESULT_PTR:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0 +// CHECK: %[[WARP_RESULT:.*]] = load ptr, ptr %[[WARP_RESULT_PTR]], align 8 +// CHECK: %[[ALLOC_MEM_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[WARP_RESULT]], i32 0, i32 0 +// CHECK: %[[ALLOC_MEM:.*]] = load ptr, ptr %[[ALLOC_MEM_PTR]], align 8 +// CHECK: %[[WARP_TRANSFER_SLOT:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %nvptx_warp_id +// CHECK: %[[WARP_RED_RES:.*]] = load i32, ptr %[[ALLOC_MEM]], align 4 +// CHECK: store volatile i32 %[[WARP_RED_RES]], ptr addrspace(3) %[[WARP_TRANSFER_SLOT]], align 4 +// CHECK: } diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir index 87ff0ba786648..08a738c8fe4c6 100644 --- a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir @@ -7,7 +7,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : llvm.func @bar() {} llvm.func @baz() {} - omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc { + omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr attributes {byref_element_type = !llvm.array<5 x f32>} alloc { %0 = llvm.mlir.constant(1 : i64) : i64 %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5> %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr @@ -67,9 +67,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : // CHECK: br label %[[CONT_BB:.*]] // CHECK: [[CONT_BB]]: -// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %final.rhs, %{{.*}} ] -// CHECK-NEXT: store ptr %[[RED_RHS]], ptr %{{.*}}, align 8 -// CHECK-NEXT: br label 
%.omp.reduction.done +// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %{{.*}}, %{{.*}} ] // CHECK: } // CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 { diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir index b8b7c780a74d0..8950db3fc48aa 100644 --- a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir @@ -109,19 +109,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: icmp eq i32 %[[MASTER]], 1 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]] // CHECK: [[THEN]]: -// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_LHS0:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_RESULT0:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS0]], %[[FINAL_RHS0]] // CHECK-NEXT: store double %[[FINAL_RESULT0]] -// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_LHS1:[A-Za-z0-9_.]*]] = load double +// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double // CHECK-NEXT: %[[FINAL_RESULT1:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS1]], %[[FINAL_RHS1]] // CHECK-NEXT: store double %[[FINAL_RESULT1]] -// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_LHS2:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_RESULT2:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS2]], %[[FINAL_RHS2]] // CHECK-NEXT: store float %[[FINAL_RESULT2]] -// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_LHS3:[A-Za-z0-9_.]*]] = load float +// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float // CHECK-NEXT: %[[FINAL_RESULT3:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS3]], 
%[[FINAL_RHS3]] // CHECK-NEXT: store float %[[FINAL_RESULT3]] diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir index 9aba72dabf13c..b7cb1026967f3 100644 --- a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir @@ -59,8 +59,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_barrier // CHECK: [[THEN]]: -// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]] // CHECK-NEXT: store i32 %[[FINAL_RESULT]] diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir index dc22fe11666cf..36eb280dfcfa2 100644 --- a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir @@ -62,8 +62,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: icmp eq i32 %[[MASTER]], 1 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]] // CHECK: [[THEN]]: -// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]] // CHECK-NEXT: store i32 %[[FINAL_RESULT]]