-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Flang] Allow Intrinsic simpification with min/maxloc dim and scalar result #81619
Conversation
@llvm/pr-subscribers-flang-fir-hlfir Author: David Green (davemgreen) ChangesThis makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. This is a recommit of #76194 with an extra alteration to the end of genRuntimeMinMaxlocBody to make sure we convert the output array to the correct type (a Full diff: https://github.com/llvm/llvm-project/pull/81619.diff 2 Files Affected:
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index 86343e23c6e5db..1e1aa2a2de7c75 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -692,7 +692,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
unsigned rank, int maskRank,
mlir::Type elementType,
mlir::Type maskElemType,
- mlir::Type resultElemTy) {
+ mlir::Type resultElemTy, bool isDim) {
auto init = [isMax](fir::FirOpBuilder builder, mlir::Location loc,
mlir::Type elementType) {
if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
@@ -877,16 +877,27 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
maskElemType, resultArr, maskRank == 0);
// Store newly created output array to the reference passed in
- fir::SequenceType::Shape resultShape(1, rank);
- mlir::Type outputArrTy = fir::SequenceType::get(resultShape, resultElemTy);
- mlir::Type outputHeapTy = fir::HeapType::get(outputArrTy);
- mlir::Type outputBoxTy = fir::BoxType::get(outputHeapTy);
- mlir::Type outputRefTy = builder.getRefType(outputBoxTy);
- mlir::Value outputArr = builder.create<fir::ConvertOp>(
- loc, outputRefTy, funcOp.front().getArgument(0));
-
- // Store nearly created array to output array
- builder.create<fir::StoreOp>(loc, resultArr, outputArr);
+ if (isDim) {
+ mlir::Type resultBoxTy =
+ fir::BoxType::get(fir::HeapType::get(resultElemTy));
+ mlir::Value outputArr = builder.create<fir::ConvertOp>(
+ loc, builder.getRefType(resultBoxTy), funcOp.front().getArgument(0));
+ mlir::Value resultArrScalar = builder.create<fir::ConvertOp>(
+ loc, fir::HeapType::get(resultElemTy), resultArrInit);
+ mlir::Value resultBox =
+ builder.create<fir::EmboxOp>(loc, resultBoxTy, resultArrScalar);
+ builder.create<fir::StoreOp>(loc, resultBox, outputArr);
+ } else {
+ fir::SequenceType::Shape resultShape(1, rank);
+ mlir::Type outputArrTy = fir::SequenceType::get(resultShape, resultElemTy);
+ mlir::Type outputHeapTy = fir::HeapType::get(outputArrTy);
+ mlir::Type outputBoxTy = fir::BoxType::get(outputHeapTy);
+ mlir::Type outputRefTy = builder.getRefType(outputBoxTy);
+ mlir::Value outputArr = builder.create<fir::ConvertOp>(
+ loc, outputRefTy, funcOp.front().getArgument(0));
+ builder.create<fir::StoreOp>(loc, resultArr, outputArr);
+ }
+
builder.create<mlir::func::ReturnOp>(loc);
}
@@ -1165,11 +1176,14 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
mlir::Operation::operand_range args = call.getArgs();
- mlir::Value back = args[6];
+ mlir::SymbolRefAttr callee = call.getCalleeAttr();
+ mlir::StringRef funcNameBase = callee.getLeafReference().getValue();
+ bool isDim = funcNameBase.ends_with("Dim");
+ mlir::Value back = args[isDim ? 7 : 6];
if (isTrueOrNotConstant(back))
return;
- mlir::Value mask = args[5];
+ mlir::Value mask = args[isDim ? 6 : 5];
mlir::Value maskDef = findMaskDef(mask);
// maskDef is set to NULL when the defining op is not one we accept.
@@ -1178,10 +1192,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
if (maskDef == NULL)
return;
- mlir::SymbolRefAttr callee = call.getCalleeAttr();
- mlir::StringRef funcNameBase = callee.getLeafReference().getValue();
unsigned rank = getDimCount(args[1]);
- if (funcNameBase.ends_with("Dim") || !(rank > 0))
+ if ((isDim && rank != 1) || !(rank > 0))
return;
fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)};
@@ -1222,22 +1234,24 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
llvm::raw_string_ostream nameOS(funcName);
outType.print(nameOS);
+ if (isDim)
+ nameOS << '_' << inputType;
nameOS << '_' << fmfString;
auto typeGenerator = [rank](fir::FirOpBuilder &builder) {
return genRuntimeMinlocType(builder, rank);
};
auto bodyGenerator = [rank, maskRank, inputType, logicalElemType, outType,
- isMax](fir::FirOpBuilder &builder,
+ isMax, isDim](fir::FirOpBuilder &builder,
mlir::func::FuncOp &funcOp) {
genRuntimeMinMaxlocBody(builder, funcOp, isMax, rank, maskRank, inputType,
- logicalElemType, outType);
+ logicalElemType, outType, isDim);
};
mlir::func::FuncOp newFunc =
getOrCreateFunction(builder, funcName, typeGenerator, bodyGenerator);
builder.create<fir::CallOp>(loc, newFunc,
- mlir::ValueRange{args[0], args[1], args[5]});
+ mlir::ValueRange{args[0], args[1], mask});
call->dropAllReferences();
call->erase();
}
diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir
index cd059cc797a3f4..4eabddc2f5ca32 100644
--- a/flang/test/Transforms/simplifyintrinsics.fir
+++ b/flang/test/Transforms/simplifyintrinsics.fir
@@ -2116,13 +2116,13 @@ func.func @_QPtestminloc_doesntwork1d_back(%arg0: !fir.ref<!fir.array<10xi32>> {
// CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
// -----
-// Check Minloc is not simplified when DIM arg is set
+// Check Minloc is simplified when DIM arg is set so long as the result is scalar
-func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> {
+func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> {
%0 = fir.alloca !fir.box<!fir.heap<i32>>
%c10 = arith.constant 10 : index
%c1 = arith.constant 1 : index
- %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_doesntwork1d_dim", uniq_name = "_QFtestminloc_doesntwork1d_dimEtestminloc_doesntwork1d_dim"}
+ %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_1d_dim", uniq_name = "_QFtestminloc_1d_dimEtestminloc_1d_dim"}
%2 = fir.shape %c1 : (index) -> !fir.shape<1>
%3 = fir.array_load %1(%2) : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.array<1xi32>
%4 = fir.shape %c10 : (index) -> !fir.shape<1>
@@ -2157,11 +2157,68 @@ func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {f
%21 = fir.load %1 : !fir.ref<!fir.array<1xi32>>
return %21 : !fir.array<1xi32>
}
-// CHECK-LABEL: func.func @_QPtestminloc_doesntwork1d_dim(
+// CHECK-LABEL: func.func @_QPtestminloc_1d_dim(
// CHECK-SAME: %[[ARR:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> {
-// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
-// CHECK: fir.call @_FortranAMinlocDim({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, i32, !fir.ref<i8>, i32, !fir.box<none>, i1) -> none
-// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
+// CHECK: fir.call @_FortranAMinlocDimx1_i32_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
+
+// CHECK-LABEL: func.func private @_FortranAMinlocDimx1_i32_i32_contract_simplified(%arg0: !fir.ref<!fir.box<none>>, %arg1: !fir.box<none>, %arg2: !fir.box<none>) attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK-NEXT: %[[V0:.*]] = fir.alloca i32
+// CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
+// CHECK-NEXT: %c1 = arith.constant 1 : index
+// CHECK-NEXT: %[[V1:.*]] = fir.allocmem !fir.array<1xi32>
+// CHECK-NEXT: %[[V2:.*]] = fir.shape %c1 : (index) -> !fir.shape<1>
+// CHECK-NEXT: %[[V3:.*]] = fir.embox %[[V1]](%[[V2]]) : (!fir.heap<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<1xi32>>>
+// CHECK-NEXT: %c0 = arith.constant 0 : index
+// CHECK-NEXT: %[[V4:.*]] = fir.coordinate_of %[[V3]], %c0 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: fir.store %c0_i32 to %[[V4]] : !fir.ref<i32>
+// CHECK-NEXT: %c0_0 = arith.constant 0 : index
+// CHECK-NEXT: %[[V5:.*]] = fir.convert %arg1 : (!fir.box<none>) -> !fir.box<!fir.array<?xi32>>
+// CHECK-NEXT: %c1_i32 = arith.constant 1 : i32
+// CHECK-NEXT: %c0_i32_1 = arith.constant 0 : i32
+// CHECK-NEXT: fir.store %c0_i32_1 to %[[V0]] : !fir.ref<i32>
+// CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32
+// CHECK-NEXT: %c1_2 = arith.constant 1 : index
+// CHECK-NEXT: %c0_3 = arith.constant 0 : index
+// CHECK-NEXT: %[[V6:.*]]:3 = fir.box_dims %[[V5]], %c0_3 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK-NEXT: %[[V7:.*]] = arith.subi %[[V6]]#1, %c1_2 : index
+// CHECK-NEXT: %[[V8:.*]] = fir.do_loop %arg3 = %c0_0 to %[[V7]] step %c1_2 iter_args(%arg4 = %c2147483647_i32) -> (i32) {
+// CHECK-NEXT: %c1_i32_4 = arith.constant 1 : i32
+// CHECK-NEXT: fir.store %c1_i32_4 to %[[V0]] : !fir.ref<i32>
+// CHECK-NEXT: %[[V12:.*]] = fir.coordinate_of %[[V5]], %arg3 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: %[[V13:.*]] = fir.load %[[V12]] : !fir.ref<i32>
+// CHECK-NEXT: %[[V14:.*]] = arith.cmpi slt, %[[V13]], %arg4 : i32
+// CHECK-NEXT: %[[V15:.*]] = fir.if %[[V14]] -> (i32) {
+// CHECK-NEXT: %c1_i32_5 = arith.constant 1 : i32
+// CHECK-NEXT: %c0_6 = arith.constant 0 : index
+// CHECK-NEXT: %[[V16:.*]] = fir.coordinate_of %[[V3]], %c0_6 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: %[[V17:.*]] = fir.convert %arg3 : (index) -> i32
+// CHECK-NEXT: %[[V18:.*]] = arith.addi %[[V17]], %c1_i32_5 : i32
+// CHECK-NEXT: fir.store %[[V18]] to %[[V16]] : !fir.ref<i32>
+// CHECK-NEXT: fir.result %[[V13]] : i32
+// CHECK-NEXT: } else {
+// CHECK-NEXT: fir.result %arg4 : i32
+// CHECK-NEXT: }
+// CHECK-NEXT: fir.result %[[V15]] : i32
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[V9:.*]] = fir.load %[[V0]] : !fir.ref<i32>
+// CHECK-NEXT: %[[V10:.*]] = arith.cmpi eq, %[[V9]], %c1_i32 : i32
+// CHECK-NEXT: fir.if %[[V10]] {
+// CHECK-NEXT: %c2147483647_i32_4 = arith.constant 2147483647 : i32
+// CHECK-NEXT: %[[V12]] = arith.cmpi eq, %c2147483647_i32_4, %[[V8]] : i32
+// CHECK-NEXT: fir.if %[[V12]] {
+// CHECK-NEXT: %c0_5 = arith.constant 0 : index
+// CHECK-NEXT: %[[V13]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32>
+// CHECK-NEXT: fir.store %c1_i32 to %[[V13]] : !fir.ref<i32>
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[V11:.*]] = fir.convert %arg0 : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
+// CHECK-NEXT: %[[V12:.*]] = fir.convert %[[V1]] : (!fir.heap<!fir.array<1xi32>>) -> !fir.heap<i32>
+// CHECK-NEXT: %[[V13:.*]] = fir.embox %[[V12]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+// CHECK-NEXT: fir.store %[[V13]] to %[[V11]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+
+
// -----
// Check Minloc is not simplified when dimension of inputArr is unknown
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
a88e698
to
c738805
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've run our internal tests with this change, and I'm seeing problems. It will take me a little while to produce sanitized test cases, but please don't merge these changes as is. I will create the test case or cases promptly.
OK thanks for checking - Maybe this cannot get away with converting the array to an int. Let me put up a new version where it changes the alloc type and if there are still problems we can try and figure out why. |
I've looked at the test results. There were two problems. First, one of our tests started issuing warnings at compile time when lowering. Here's an example that shows the problem:
Second,I'm seeing differences in what floating point exceptions get raised when doing IEEE arithmetic. Here's a program that shows that difference:
|
Oh, thanks for the details. That certainly isn't what I expected the problems to be! I had looked into the Nan issue a little last week, but did not think to test it with inf. This will be a pre-existing issue that is now starting to happen with dim too, as in a test like the one below. I'll see about getting that fixed.
For the other issue - I can imagine that a previous runtime error now becomes a compile time warning. I don't see the same warning, but it seems to go from a runtime error to just working (there is only really one valid value for dim as far as I understand) and it appears gfortran does the same. I can maybe make it only work for constant dim values, if that would be better than not producing a runtime error message? |
To be clear, the warning is coming from lowering. We don't want users to get any warnings from lowering. Here's what I get:
|
Which options are you using to get that? Cheers |
I just typed |
…result. This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions. This is a recommit of llvm#75820 with an extra alteration to the end of genRuntimeMinMaxlocBody to make sure we convert the output array to the correct type (a `box<heap<i32>>`, not `box<heap<array<1xi32>>>`) to prevent writing the wrong type into it. This still allocates the data as a `array<1xi32>`, converting that into a i32 assuming that is safe. The alternative would be to allocate the data as a i32 and change more of the accesses to it throughout genRuntimeMinMaxlocBody.
c738805
to
e16c4c1
Compare
I've rebased after #82313. As far as I understand the Nan issue should now be fixed. I have a couple of other questions:
Thanks |
So far as I can tell, the standard does not say what should happen for an invalid DIM index, so I suppose this is fine so long as there's a tangible benefit to skipping this check. @psteinfeld do you have an opinion on this? Are you skipping this check for performance, or is this just because there isn't a convenient way to output an error message and exit from FIR?
I believe this should be safe. |
Thanks - It just doesn't naturally check at runtime. Constants should be checked at compile time, as far as I understand. It will be the variable case from https://godbolt.org/z/f86YE1WeM that might not be diagnosed. (I had noticed that gfortran does the same thing, assuming that the dim will be a correct value). I can add a check at compile time that the dim is a constant and the correct value if that would be more preferable. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay I had a look at the runtime library and I don't think that catches a dynamically out of bounds dim either. So I think this is okay. +1 from me
I don't think a compile time check for constants is right here (except maybe as an assertion). This sort of thing should be caught in semantics so we can generate a properly formatted warning/error. What I would have considered was a runtime check for DIM, but as the runtime library code appears not to be doing that, I think it is okay not to do that here either.
Thanks for your work on this
Thanks. This has been committed twice already. @psteinfeld would you be unhappy with it being committed now? I'd like to avoid it being reverted again if we can avoid it! Cheers. |
I'll retest and post my results. Stay tuned .... |
My testing looks good! Thanks for sticking with this. |
Thanks for testing. I appreciate the patience and will attempt not to use so much of your time in the future. Please let me know if anything still goes wrong. Thanks |
This makes an adjustment to the existing fir minloc/maxloc generation code to handle functions with a dim=1 that produce a scalar result. This should allow us to get the same benefits as the existing generated minmax reductions.
This is a recommit of #76194 with an extra alteration to the end of genRuntimeMinMaxlocBody to make sure we convert the output array to the correct type (a
box<heap<i32>>
, notbox<heap<array<1xi32>>>
) to prevent writing the wrong type of box into it. This still allocates the data as aarray<1xi32>
, converting it into a i32 assuming that is safe. An alternative would be to allocate the data as a i32 and change more of the accesses to it throughout genRuntimeMinMaxlocBody.