-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Flang] Allow Intrinsic simpification with min/maxloc dim and scalar result. #75820
Conversation
…result. This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions.
@llvm/pr-subscribers-flang-fir-hlfir Author: David Green (davemgreen) ChangesThis makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. Full diff: https://github.com/llvm/llvm-project/pull/75820.diff 2 Files Affected:
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index c89ee6d5e20391..12f354a47c2bcc 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -1162,11 +1162,14 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
mlir::Operation::operand_range args = call.getArgs();
- mlir::Value back = args[6];
+ mlir::SymbolRefAttr callee = call.getCalleeAttr();
+ mlir::StringRef funcNameBase = callee.getLeafReference().getValue();
+ bool isDim = funcNameBase.ends_with("Dim");
+ mlir::Value back = args[isDim ? 7 : 6];
if (isTrueOrNotConstant(back))
return;
- mlir::Value mask = args[5];
+ mlir::Value mask = args[isDim ? 6 : 5];
mlir::Value maskDef = findMaskDef(mask);
// maskDef is set to NULL when the defining op is not one we accept.
@@ -1175,10 +1178,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
if (maskDef == NULL)
return;
- mlir::SymbolRefAttr callee = call.getCalleeAttr();
- mlir::StringRef funcNameBase = callee.getLeafReference().getValue();
unsigned rank = getDimCount(args[1]);
- if (funcNameBase.ends_with("Dim") || !(rank > 0))
+ if ((isDim && rank != 1) || !(rank > 0))
return;
fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)};
@@ -1234,7 +1235,7 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
mlir::func::FuncOp newFunc =
getOrCreateFunction(builder, funcName, typeGenerator, bodyGenerator);
builder.create<fir::CallOp>(loc, newFunc,
- mlir::ValueRange{args[0], args[1], args[5]});
+ mlir::ValueRange{args[0], args[1], mask});
call->dropAllReferences();
call->erase();
}
diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir
index 0bd6ac7c436ff2..d42924a17a804d 100644
--- a/flang/test/Transforms/simplifyintrinsics.fir
+++ b/flang/test/Transforms/simplifyintrinsics.fir
@@ -2115,13 +2115,13 @@ func.func @_QPtestminloc_doesntwork1d_back(%arg0: !fir.ref<!fir.array<10xi32>> {
// CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
// -----
-// Check Minloc is not simplified when DIM arg is set
+// Check Minloc is simplified when DIM arg is set so long as the result is scalar
-func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> {
+func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> {
%0 = fir.alloca !fir.box<!fir.heap<i32>>
%c10 = arith.constant 10 : index
%c1 = arith.constant 1 : index
- %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_doesntwork1d_dim", uniq_name = "_QFtestminloc_doesntwork1d_dimEtestminloc_doesntwork1d_dim"}
+ %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_1d_dim", uniq_name = "_QFtestminloc_1d_dimEtestminloc_1d_dim"}
%2 = fir.shape %c1 : (index) -> !fir.shape<1>
%3 = fir.array_load %1(%2) : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.array<1xi32>
%4 = fir.shape %c10 : (index) -> !fir.shape<1>
@@ -2156,11 +2156,65 @@ func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {f
%21 = fir.load %1 : !fir.ref<!fir.array<1xi32>>
return %21 : !fir.array<1xi32>
}
-// CHECK-LABEL: func.func @_QPtestminloc_doesntwork1d_dim(
+// CHECK-LABEL: func.func @_QPtestminloc_1d_dim(
// CHECK-SAME: %[[ARR:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> {
-// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
-// CHECK: fir.call @_FortranAMinlocDim({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, i32, !fir.ref<i8>, i32, !fir.box<none>, i1) -> none
-// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
+// CHECK: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
+
+// CHECK-LABEL: func.func private @_FortranAMinlocDimx1_i32_contract_simplified(%arg0: !fir.ref<!fir.box<none>>, %arg1: !fir.box<none>, %arg2: !fir.box<none>) attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK-NEXT: %[[V0:.*]] = fir.alloca i32
+// CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
+// CHECK-NEXT: %c1 = arith.constant 1 : index
+// CHECK-NEXT: %[[V1:.*]] = fir.allocmem !fir.array<1xi32>
+// CHECK-NEXT: %[[V2:.*]] = fir.shape %c1 : (index) -> !fir.shape<1>
+// CHECK-NEXT: %[[V3:.*]] = fir.embox %[[V1]](%[[V2]]) : (!fir.heap<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<1xi32>>>
+// CHECK-NEXT: %c0 = arith.constant 0 : index
+// CHECK-NEXT: %[[V4:.*]] = fir.coordinate_of %[[V3]], %c0 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: fir.store %c0_i32 to %[[V4]] : !fir.ref<i32>
+// CHECK-NEXT: %c0_0 = arith.constant 0 : index
+// CHECK-NEXT: %[[V5:.*]] = fir.convert %arg1 : (!fir.box<none>) -> !fir.box<!fir.array<?xi32>>
+// CHECK-NEXT: %c1_i32 = arith.constant 1 : i32
+// CHECK-NEXT: %c0_i32_1 = arith.constant 0 : i32
+// CHECK-NEXT: fir.store %c0_i32_1 to %[[V0]] : !fir.ref<i32>
+// CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32
+// CHECK-NEXT: %c1_2 = arith.constant 1 : index
+// CHECK-NEXT: %c0_3 = arith.constant 0 : index
+// CHECK-NEXT: %[[V6:.*]]:3 = fir.box_dims %[[V5]], %c0_3 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK-NEXT: %[[V7:.*]] = arith.subi %[[V6]]#1, %c1_2 : index
+// CHECK-NEXT: %[[V8:.*]] = fir.do_loop %arg3 = %c0_0 to %[[V7]] step %c1_2 iter_args(%arg4 = %c2147483647_i32) -> (i32) {
+// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref<i32>
+// CHECK-NEXT: %[[V12:.*]] = fir.coordinate_of %[[V5]], %arg3 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: %[[V13:.*]] = fir.load %[[V12]] : !fir.ref<i32>
+// CHECK-NEXT: %[[V14:.*]] = arith.cmpi slt, %[[V13]], %arg4 : i32
+// CHECK-NEXT: %[[V15:.*]] = fir.if %[[V14]] -> (i32) {
+// CHECK-NEXT: %c1_i32_4 = arith.constant 1 : i32
+// CHECK-NEXT: %c0_5 = arith.constant 0 : index
+// CHECK-NEXT: %[[V16:.*]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: %[[V17:.*]] = fir.convert %arg3 : (index) -> i32
+// CHECK-NEXT: %[[V18:.*]] = arith.addi %[[V17]], %c1_i32_4 : i32
+// CHECK-NEXT: fir.store %[[V18]] to %[[V16]] : !fir.ref<i32>
+// CHECK-NEXT: fir.result %[[V13]] : i32
+// CHECK-NEXT: } else {
+// CHECK-NEXT: fir.result %arg4 : i32
+// CHECK-NEXT: }
+// CHECK-NEXT: fir.result %[[V15]] : i32
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[V9:.*]] = fir.load %[[V0]] : !fir.ref<i32>
+// CHECK-NEXT: %[[V10:.*]] = arith.cmpi eq, %[[V9]], %c1_i32 : i32
+// CHECK-NEXT: fir.if %[[V10]] {
+// CHECK-NEXT: %c2147483647_i32_4 = arith.constant 2147483647 : i32
+// CHECK-NEXT: %[[V12]] = arith.cmpi eq, %c2147483647_i32_4, %[[V8]] : i32
+// CHECK-NEXT: fir.if %[[V12]] {
+// CHECK-NEXT: %c0_5 = arith.constant 0 : index
+// CHECK-NEXT: %[[V13]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: fir.store %c1_i32 to %[[V13]] : !fir.ref<i32>
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[V11:.*]] = fir.convert %arg0 : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<1xi32>>>>
+// CHECK-NEXT: fir.store %[[V3]] to %[[V11]] : !fir.ref<!fir.box<!fir.heap<!fir.array<1xi32>>>>
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+
+
// -----
// Check Minloc is not simplified when dimension of inputArr is unknown
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Thanks |
… scalar result. (llvm#75820)" This reverts commit 701f647. The commit breaks some uses of the 'maxloc' intrinsic.
This change breaks some uses of the
I'm reverting this change in #76184. |
I'll be happy to review a follow-on pull request to verify that it passes all of our internal tests. |
Oh Thanks, I see. It looks like it is expecting the type name to already be encoded into the name of the function. So with multiple calls of different types it can call the wrong function. |
…result. (llvm#75820) This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions.
Thanks for the quick response! And I forgot to mention, I didn't provide a test case, but the current runtime functions check the arguments, and it looks like the inlined version doesn't do so. |
…result. (llvm#75820) This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions.
Do you mean it checks that the dim is a valid value? |
The runtime implementation has at least two checks. One is to ensure that |
The dim we could add a check for to make sure it is a constant. There is only really one valid value (=1) for the dim in this case, so it assumes that the runtime value is correct (and constants are already validated earlier). You would end up losing the performance for the non-constant case where you know there was only one valid value. I will defer to the others who know more about flang to weigh in on whether that might be better. Mask might be more difficult than dim to detect statically. At least in llvm/clang I believe it is common to say that with/without debug info should not change the generated code. It could perhaps be something that behaves differently at -O0 vs -O2 for example, and from a quick test I just ran it appears that might already be the case. The lack of optimizations means we always generate the runtime call that performs the relevant checks. |
I guess it is alright to relax some of these checks with optimisations.
The intrinsic simplification pass is invoked only when optimizing for speed (O1 and above).
|
…result (#76194) This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. This is a recommit of #75820 with the typename added to the generated function.
…result (llvm#76194) This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. This is a recommit of llvm#75820 with the typename added to the generated function.
…result. This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. This is a recommit of llvm#75820 with an extra alteration to the end of genRuntimeMinMaxlocBody to make sure we convert the output array to the correct type (a `box<heap<i32>>`, not `box<heap<array<1xi32>>>`) to prevent writing the wrong type into it. This still allocates the data as a `array<1xi32>`, converting that into a i32 assuming that is safe. The alternative would be to allocate the data as a i32 and change more of the accesses to it throughout genRuntimeMinMaxlocBody.
…result. This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. This is a recommit of llvm#75820 with an extra alteration to the end of genRuntimeMinMaxlocBody to make sure we convert the output array to the correct type (a `box<heap<i32>>`, not `box<heap<array<1xi32>>>`) to prevent writing the wrong type into it. This still allocates the data as a `array<1xi32>`, converting that into a i32 assuming that is safe. The alternative would be to allocate the data as a i32 and change more of the accesses to it throughout genRuntimeMinMaxlocBody.
This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions.