diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index dab200d86d4a0..669d4f0f45714 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4003,18 +4003,29 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 
   // Try to fold intrinsic into select/phi operands. This is legal if:
   // * The intrinsic is speculatable.
-  // * The select condition is not a vector, or the intrinsic does not
-  //   perform cross-lane operations.
-  if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) &&
-      isNotCrossLaneOperation(II))
+  // * The operand is one of the following:
+  //   - a phi.
+  //   - a select with a scalar condition.
+  //   - a select with a vector condition and II is not a cross lane operation.
+  if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI)) {
     for (Value *Op : II->args()) {
-      if (auto *Sel = dyn_cast<SelectInst>(Op))
-        if (Instruction *R = FoldOpIntoSelect(*II, Sel))
+      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
+        bool IsVectorCond = Sel->getCondition()->getType()->isVectorTy();
+        if (IsVectorCond && !isNotCrossLaneOperation(II))
+          continue;
+        // Don't replace a scalar select with a more expensive vector select if
+        // we can't simplify both arms of the select.
+        bool SimplifyBothArms =
+            !Op->getType()->isVectorTy() && II->getType()->isVectorTy();
+        if (Instruction *R = FoldOpIntoSelect(
+                *II, Sel, /*FoldWithMultiUse=*/false, SimplifyBothArms))
           return R;
+      }
       if (auto *Phi = dyn_cast<PHINode>(Op))
         if (Instruction *R = foldOpIntoPhi(*II, Phi))
           return R;
     }
+  }
 
   if (Instruction *Shuf = foldShuffledIntrinsicOperands(II))
     return Shuf;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 943c223e516fb..ede73f86f1944 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -664,7 +664,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   /// This also works for Cast instructions, which obviously do not have a
   /// second operand.
   Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
-                                bool FoldWithMultiUse = false);
+                                bool FoldWithMultiUse = false,
+                                bool SimplifyBothArms = false);
 
   /// This is a convenience wrapper function for the above two functions.
   Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 3f11cae143b81..67e2aae4eda63 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1777,7 +1777,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
 }
 
 Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
-                                                bool FoldWithMultiUse) {
+                                                bool FoldWithMultiUse,
+                                                bool SimplifyBothArms) {
   // Don't modify shared select instructions unless set FoldWithMultiUse
   if (!SI->hasOneUse() && !FoldWithMultiUse)
     return nullptr;
@@ -1821,6 +1822,9 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
   if (!NewTV && !NewFV)
     return nullptr;
 
+  if (SimplifyBothArms && !(NewTV && NewFV))
+    return nullptr;
+
   // Create an instruction for the arm that did not fold.
   if (!NewTV)
     NewTV = foldOperationIntoSelectOperand(Op, SI, TV, *this);
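
To make the new `SimplifyBothArms` guard concrete: it is set when the select operand is scalar but the intrinsic's result is a vector, so a fold that simplifies only one arm would trade a cheap scalar select for a vector select plus a live intrinsic call. A minimal IR sketch of the rejected shape, mirroring the negative test added to intrinsic-select.ll below (value names are illustrative):

```llvm
; Input shape:
;   %s = select i1 %cond, i64 %n, i64 0
;   %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %base, i64 %s)
;
; What the fold would produce if allowed (only the %s == 0 arm simplifies,
; to zeroinitializer; the %n arm stays a full intrinsic call):
;   %m = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %base, i64 %n)
;   %mask = select i1 %cond, <16 x i1> %m, <16 x i1> zeroinitializer
```
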
diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
index 2f1f9fc2b6f9e..fc9ab9f4bf1d3 100644
--- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll
+++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
@@ -222,8 +222,7 @@ declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
 
 define i32 @vec_to_scalar_select_scalar(i1 %b) {
 ; CHECK-LABEL: @vec_to_scalar_select_scalar(
-; CHECK-NEXT:    [[S:%.*]] = select i1 [[B:%.*]], <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
-; CHECK-NEXT:    [[C:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[S]])
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[B:%.*]], i32 3, i32 7
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %s = select i1 %b, <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
@@ -371,3 +370,36 @@ define float @test_fabs_select_multiuse_both_constant(i1 %cond, float %x) {
   %fabs = call float @llvm.fabs.f32(float %select)
   ret float %fabs
 }
+
+; Negative test: Don't replace with select between vector mask and zeroinitializer.
+define <16 x i1> @test_select_of_active_lane_mask_bound(i64 %base, i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_active_lane_mask_bound(
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[COND:%.*]], i64 [[N:%.*]], i64 0
+; CHECK-NEXT:    [[MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[BASE:%.*]], i64 [[S]])
+; CHECK-NEXT:    ret <16 x i1> [[MASK]]
+;
+  %s = select i1 %cond, i64 %n, i64 0
+  %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %base, i64 %s)
+  ret <16 x i1> %mask
+}
+
+define <16 x i1> @test_select_of_active_lane_mask_bound_both_constant(i64 %base, i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_active_lane_mask_bound_both_constant(
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], <16 x i1> splat (i1 true), <16 x i1> zeroinitializer
+; CHECK-NEXT:    ret <16 x i1> [[MASK]]
+;
+  %s = select i1 %cond, i64 16, i64 0
+  %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 %s)
+  ret <16 x i1> %mask
+}
+
+define { i64, i1 } @test_select_of_overflow_intrinsic_operand(i64 %n, i1 %cond) {
+; CHECK-LABEL: @test_select_of_overflow_intrinsic_operand(
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[N:%.*]], i64 42)
+; CHECK-NEXT:    [[ADD_OVERFLOW:%.*]] = select i1 [[COND:%.*]], { i64, i1 } [[TMP1]], { i64, i1 } { i64 42, i1 false }
+; CHECK-NEXT:    ret { i64, i1 } [[ADD_OVERFLOW]]
+;
+  %s = select i1 %cond, i64 %n, i64 0
+  %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %s, i64 42)
+  ret { i64, i1 } %add_overflow
+}
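
A hedged walk-through of why `@test_select_of_overflow_intrinsic_operand` above still folds even though the result is a struct: the struct return type is not a vector, so the `SimplifyBothArms` guard does not apply, and the zero arm constant-folds outright because unsigned 0 + 42 cannot overflow:

```llvm
; The %s == 0 arm evaluates at compile time:
;   call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 0, i64 42)
;     ==> { i64 42, i1 false }
; leaving a select between the live call on %n and a constant struct,
; exactly as the CHECK lines verify.
```
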
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index 3454835d3ad65..1c8b21ceb029b 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -3026,6 +3026,56 @@ join:
   ret i32 %umax
 }
 
+define i32 @cross_lane_intrinsic_over_phi(i1 %c, i1 %c2, <4 x i32> %a) {
+; CHECK-LABEL: @cross_lane_intrinsic_over_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A:%.*]])
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[TMP0]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    call void @may_exit()
+; CHECK-NEXT:    ret i32 [[PHI]]
+;
+entry:
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %phi = phi <4 x i32> [ %a, %if ], [ zeroinitializer, %entry ]
+  call void @may_exit()
+  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %phi)
+  ret i32 %sum
+}
+
+define { i64, i1 } @overflow_intrinsic_over_phi(i1 %c, i64 %a) {
+; CHECK-LABEL: @overflow_intrinsic_over_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A:%.*]], i64 1)
+; CHECK-NEXT:    br label [[JOIN]]
+; CHECK:       join:
+; CHECK-NEXT:    [[PHI:%.*]] = phi { i64, i1 } [ [[TMP0]], [[IF]] ], [ { i64 1, i1 false }, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    call void @may_exit()
+; CHECK-NEXT:    ret { i64, i1 } [[PHI]]
+;
+entry:
+  br i1 %c, label %if, label %join
+
+if:
+  br label %join
+
+join:
+  %phi = phi i64 [ %a, %if ], [ 0, %entry ]
+  call void @may_exit()
+  %add_overflow = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %phi, i64 1)
+  ret { i64, i1 } %add_overflow
+}
+
 define i32 @multiple_intrinsics_with_multiple_phi_uses(i1 %c, i32 %arg) {
 ; CHECK-LABEL: @multiple_intrinsics_with_multiple_phi_uses(
 ; CHECK-NEXT:  entry:
diff --git a/llvm/test/Transforms/InstCombine/select_frexp.ll b/llvm/test/Transforms/InstCombine/select_frexp.ll
index d025aedda7170..ccfcca1fe7cb5 100644
--- a/llvm/test/Transforms/InstCombine/select_frexp.ll
+++ b/llvm/test/Transforms/InstCombine/select_frexp.ll
@@ -115,10 +115,10 @@ define float @test_select_frexp_no_const(float %x, float %y, i1 %cond) {
 define i32 @test_select_frexp_extract_exp(float %x, i1 %cond) {
 ; CHECK-LABEL: define i32 @test_select_frexp_extract_exp(
 ; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], float 1.000000e+00, float [[X]]
-; CHECK-NEXT:    [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SEL]])
+; CHECK-NEXT:    [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
 ; CHECK-NEXT:    [[FREXP_1:%.*]] = extractvalue { float, i32 } [[FREXP]], 1
-; CHECK-NEXT:    ret i32 [[FREXP_1]]
+; CHECK-NEXT:    [[FREXP_2:%.*]] = select i1 [[COND]], i32 1, i32 [[FREXP_1]]
+; CHECK-NEXT:    ret i32 [[FREXP_2]]
 ;
   %sel = select i1 %cond, float 1.000000e+00, float %x
   %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %sel)
@@ -132,7 +132,7 @@ define float @test_select_frexp_fast_math_select(float %x, i1 %cond) {
 ; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
 ; CHECK-NEXT:    [[FREXP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
 ; CHECK-NEXT:    [[MANTISSA:%.*]] = extractvalue { float, i32 } [[FREXP1]], 0
-; CHECK-NEXT:    [[SELECT_FREXP:%.*]] = select nnan ninf nsz i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
+; CHECK-NEXT:    [[SELECT_FREXP:%.*]] = select i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
 ; CHECK-NEXT:    ret float [[SELECT_FREXP]]
 ;
   %sel = select nnan ninf nsz i1 %cond, float 1.000000e+00, float %x
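
The select_frexp.ll updates follow from frexp's definition: 1.0 decomposes as 0.5 * 2^1, so the constant arm folds to a known mantissa/exponent pair. Note also that the second hunk shows the rewritten select without the original `nnan ninf nsz` flags, since the generic fold does not carry the input select's fast-math flags over. A hedged worked example:

```llvm
; Constant arm of the select, evaluated at compile time:
;   call { float, i32 } @llvm.frexp.f32.i32(float 1.000000e+00)
;     ==> { float 5.000000e-01, i32 1 }
;
; Hence the exponent test selects between i32 1 and the extracted exponent,
; and the mantissa test selects between 5.0e-01 and the extracted mantissa.
```
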
diff --git a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
index 63279b04bda28..92ea2c608fe53 100644
--- a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
+++ b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
@@ -35,9 +35,8 @@ define void @ham() #1 {
 ; CHECK-NEXT:  [[SNORK_EXIT:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr inttoptr (i64 48 to ptr), align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i64 0)
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[TMP1]], <vscale x 16 x float> [[TMP2]], <vscale x 16 x float> undef
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[SPEC_SELECT]], i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> undef, i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP2]]
 ; CHECK-NEXT:    tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP3]])
 ; CHECK-NEXT:    ret void
 ;
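
A hedged reading of the PhaseOrdering change (value names are illustrative): the select now operates on the narrower <vscale x 4 x float> value after the extract rather than on the full <vscale x 16 x float> before it, and the `vector.insert` of zeroinitializer disappears because that arm of the select folds to zeroinitializer directly:

```llvm
; Old shape:
;   %spec = select i1 %cmp, <vscale x 16 x float> %ins, <vscale x 16 x float> undef
;   %v = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> %spec, i64 0)
;
; New shape (extract folded into both select arms; the zeroinitializer arm
; simplifies, the undef arm keeps one extract):
;   %e = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> undef, i64 0)
;   %v = select i1 %cmp, <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> %e
```
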