Skip to content

Commit e23a9e7

Browse files
committed
[OpenMP][flang] Add initial support for by-ref reductions on the GPU
Adds initial support for GPU by-ref reductions. In particular, this diff adds support for reductions on scalar allocatables where the reductions happen on loops nested in `target` regions. For example:

```fortran
integer :: i
real, allocatable :: scalar_alloc

allocate(scalar_alloc)
scalar_alloc = 0

!$omp target map(tofrom: scalar_alloc)
!$omp parallel do reduction(+: scalar_alloc)
do i = 1, 1000000
  scalar_alloc = scalar_alloc + 1
end do
!$omp end target
```

This PR supports by-ref reductions on the intra- and inter-warp levels. There are still steps to be taken for full support of by-ref reductions, for example:

* Inter-block value combination is still not supported. Therefore, `target teams distribute parallel do` is still not supported.
* Support for dynamically-sized arrays still needs to be added.
* Support for more than one allocatable/array on the same `reduction` clause still needs to be added.
1 parent 8c8bead commit e23a9e7

34 files changed

+294
-84
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1784,8 +1784,8 @@ void CGOpenMPRuntimeGPU::emitReduction(
17841784

17851785
llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
17861786
cantFail(OMPBuilder.createReductionsGPU(
1787-
OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
1788-
llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1787+
OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, {}, false,
1788+
TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
17891789
CGF.getTarget().getGridValue(),
17901790
C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
17911791
CGF.Builder.restoreIP(AfterIP);

flang/include/flang/Optimizer/Dialect/FIROps.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3743,7 +3743,8 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove,
37433743
}];
37443744

37453745
let arguments = (ins SymbolNameAttr:$sym_name,
3746-
TypeAttr:$type);
3746+
TypeAttr:$type,
3747+
OptionalAttr<TypeAttr>:$byref_element_type);
37473748

37483749
let regions = (region MaxSizedRegion<1>:$allocRegion,
37493750
AnyRegion:$initializerRegion,

flang/lib/Lower/Support/ReductionProcessor.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,10 +573,15 @@ OpType ReductionProcessor::createDeclareReduction(
573573

574574
mlir::OpBuilder modBuilder(module.getBodyRegion());
575575
mlir::Type valTy = fir::unwrapRefType(type);
576+
mlir::TypeAttr boxedTy{};
577+
576578
if (!isByRef)
577579
type = valTy;
578580

579-
decl = OpType::create(modBuilder, loc, reductionOpName, type);
581+
if (isByRef)
582+
boxedTy = mlir::TypeAttr::get(fir::unwrapPassByRefType(valTy));
583+
584+
decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTy);
580585
createReductionAllocAndInitRegions(converter, loc, decl, redId, type,
581586
isByRef);
582587

flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -851,7 +851,8 @@ class DoConcurrentConversion
851851
if (!ompReducer) {
852852
ompReducer = mlir::omp::DeclareReductionOp::create(
853853
rewriter, firReducer.getLoc(), ompReducerName,
854-
firReducer.getTypeAttr().getValue());
854+
firReducer.getTypeAttr().getValue(),
855+
firReducer.getByrefElementTypeAttr());
855856

856857
cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
857858
ompReducer.getAllocRegion());

flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ subroutine red_and_delayed_private
2222
! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32
2323

2424
! CHECK-LABEL: omp.declare_reduction
25-
! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> alloc
25+
! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> attributes {byref_element_type = i32} alloc
2626

2727
! CHECK-LABEL: _QPred_and_delayed_private
2828
! CHECK: omp.parallel

flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ program reduce
1818

1919
end program
2020

21-
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc {
21+
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc {
2222
! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
2323
! CHECK: omp.yield(%[[VAL_10]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
2424
! CHECK-LABEL: } init {

flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ program reduce
1212

1313
end program
1414

15-
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> alloc {
15+
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> {{.*}} alloc {
1616
! CHECK: %[[VAL_15:.*]] = fir.alloca !fir.box<!fir.array<3x2xi32>>
1717
! CHECK: omp.yield(%[[VAL_15]] : !fir.ref<!fir.box<!fir.array<3x2xi32>>>)
1818
! CHECK-LABEL: } init {

flang/test/Lower/OpenMP/parallel-reduction-array.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ program reduce
1717
print *,i
1818
end program
1919

20-
! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
20+
! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> attributes {byref_element_type = !fir.array<3xi32>} alloc {
2121
! CPU: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
2222
! CPU: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
2323
! CPU-LABEL: } init {

flang/test/Lower/OpenMP/parallel-reduction-array2.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ program reduce
1313
print *,i
1414
end program
1515

16-
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
16+
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> {{.*}} alloc {
1717
! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
1818
! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
1919
! CHECK-LABEL: } init {

flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ program reduce
1919

2020
end program
2121

22-
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> alloc {
22+
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc {
2323
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
2424
! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
2525
! CHECK-LABEL: } init {

0 commit comments

Comments
 (0)