diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 1a279b6198182..182a1eb760abb 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2181,8 +2181,8 @@ class LSRInstance {
   SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
 
   void OptimizeShadowIV();
-  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
-  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+  bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
+  Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
   void OptimizeLoopTermCond();
 
   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
@@ -2416,7 +2416,7 @@ void LSRInstance::OptimizeShadowIV() {
 
 /// If Cond has an operand that is an expression of an IV, set the IV user and
 /// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
   for (IVStrideUse &U : IU)
     if (U.getUser() == Cond) {
       // NOTE: we could handle setcc instructions with multiple uses here, but
@@ -2476,7 +2476,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
 /// This function solves this problem by detecting this type of loop and
 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
 /// the instructions for the maximum computation.
-ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
   // Check that the loop matches the pattern we're looking for.
   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
       Cond->getPredicate() != CmpInst::ICMP_NE)
@@ -2620,15 +2620,26 @@ LSRInstance::OptimizeLoopTermCond() {
     // one register value.
     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
-    if (!TermBr)
+    if (!TermBr || TermBr->isUnconditional())
       continue;
-    // FIXME: Overly conservative, termination condition could be an 'or' etc..
-    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+
+    Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
+    bool CondImmediatelyBeforeTerm = Cond && Cond->getNextNode() == TermBr;
+    // If the argument to TermBr is an extractelement, then the source of that
+    // instruction is what generates the condition.
+    auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
+    if (Extract) {
+      Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
+      if (Cond && CondImmediatelyBeforeTerm)
+        CondImmediatelyBeforeTerm = Cond->getNextNode() == Extract;
+    }
+    // FIXME: We could do more here, like handling logical operations where one
+    // side is a cmp that uses an induction variable.
+    if (!Cond)
       continue;
 
     // Search IVUsesByStride to find Cond's IVUse if there is one.
     IVStrideUse *CondUse = nullptr;
-    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
     if (!FindIVUserForCond(Cond, CondUse))
       continue;
 
@@ -2638,7 +2649,8 @@ LSRInstance::OptimizeLoopTermCond() {
     // One consequence of doing this now is that it disrupts the count-down
     // optimization. That's not always a bad thing though, because in such
     // cases it may still be worthwhile to avoid a max.
-    Cond = OptimizeMax(Cond, CondUse);
+    if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
+      Cond = OptimizeMax(Cmp, CondUse);
 
     // If this exiting block dominates the latch block, it may also use
     // the post-inc value if it won't be shared with other uses.
@@ -2703,13 +2715,13 @@ LSRInstance::OptimizeLoopTermCond() {
     // It's possible for the setcc instruction to be anywhere in the loop, and
     // possible for it to have multiple users. If it is not immediately before
    // the exiting block branch, move it.
-    if (Cond->getNextNode() != TermBr) {
+    if (!CondImmediatelyBeforeTerm && isa<ICmpInst>(Cond) && !Extract) {
       if (Cond->hasOneUse()) {
         Cond->moveBefore(TermBr->getIterator());
       } else {
         // Clone the terminating condition and insert into the loopend.
-        ICmpInst *OldCond = Cond;
-        Cond = cast<ICmpInst>(Cond->clone());
+        Instruction *OldCond = Cond;
+        Cond = Cond->clone();
         Cond->setName(L->getHeader()->getName() + ".termcond");
         Cond->insertInto(ExitingBlock, TermBr->getIterator());
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index d67aa08125f74..79f0cd345f95c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -16,32 +16,32 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, #100 // =0x64
-; CHECK-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    mov w9, #100 // =0x64
+; CHECK-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    cntd x10
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x11, x9
+; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    mov z7.d, z1.d
 ; CHECK-NEXT:    zip1 p1.d, p1.d, p1.d
+; CHECK-NEXT:    add x8, x8, x10
 ; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT:    add x1, x1, x10
-; CHECK-NEXT:    add x0, x0, x10
+; CHECK-NEXT:    add x1, x1, x11
+; CHECK-NEXT:    add x0, x0, x11
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    mov z1.d, p2/m, z7.d
 ; CHECK-NEXT:    mov z0.d, p1/m, z6.d
-; CHECK-NEXT:    whilelo p1.d, x11, x8
-; CHECK-NEXT:    add x11, x11, x9
+; CHECK-NEXT:    whilelo p1.d, x8, x9
 ; CHECK-NEXT:    b.mi .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
 ; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
@@ -213,19 +213,18 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, #100 // =0x64
-; CHECK-NEXT:    whilelo p1.d, xzr, x8
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    rdvl x10, #2
+; CHECK-NEXT:    mov w9, #100 // =0x64
+; CHECK-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    cntd x10
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cnth x11
-; CHECK-NEXT:    mov x12, x9
+; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z2.d }, p1/z, [x2]
+; CHECK-NEXT:    ld1w { z2.d }, p1/z, [x2, x8, lsl #2]
 ; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    add x2, x2, x11
+; CHECK-NEXT:    add x8, x8, x10
 ; CHECK-NEXT:    and z2.d, z2.d, #0xffffffff
 ; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
@@ -234,16 +233,15 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x0]
 ; CHECK-NEXT:    ld1d { z5.d }, p1/z, [x1]
-; CHECK-NEXT:    add x1, x1, x10
-; CHECK-NEXT:    add x0, x0, x10
+; CHECK-NEXT:    add x1, x1, x11
+; CHECK-NEXT:    add x0, x0, x11
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #90
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    mov z1.d, p2/m, z7.d
 ; CHECK-NEXT:    mov z0.d, p1/m, z6.d
-; CHECK-NEXT:    whilelo p1.d, x12, x8
-; CHECK-NEXT:    add x12, x12, x9
+; CHECK-NEXT:    whilelo p1.d, x8, x9
 ; CHECK-NEXT:    b.mi .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
 ; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll
new file mode 100644
index 0000000000000..5590208865bd2
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -loop-reduce %s -S -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Tests where the loop termination condition is not generated by a compare.
+
+; The call to get.active.lane.mask in the loop should use the postincrement
+; value of %iv.
+define void @lane_mask(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP1]] = add i64 [[IV]], [[VSCALEX4]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[N]])
+; CHECK-NEXT:    [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %iv.next = add i64 %iv, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The store between the call and the branch shouldn't prevent the
+; postincrement value from being used.
+define void @lane_mask_not_last(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP0:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP0]] = add i64 [[IV]], [[VSCALEX4]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP0]], i64 [[N]])
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %iv.next = add i64 %iv, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The call to cmp_fn in the loop should use the postincrement value of %iv.
+define void @uses_cmp_fn(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[LSR_IV]], 2
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    store i32 0, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
+; CHECK-NEXT:    [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV_NEXT]])
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  store i32 0, ptr %gep, align 4
+  %iv.next = add i64 %iv, 1
+  %cond = tail call i1 @cmp_fn(i64 %iv.next)
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The store between the call and the branch shouldn't prevent the
+; postincrement value from being used.
+define void @uses_cmp_fn_not_last(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[IV]], 2
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    [[LSR_IV]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV]])
+; CHECK-NEXT:    store i32 0, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %iv.next = add i64 %iv, 1
+  %cond = tail call i1 @cmp_fn(i64 %iv.next)
+  store i32 0, ptr %gep, align 4
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; cmp2 will use a preincrement induction variable as it isn't directly the loop
+; termination condition.
+; FIXME: We could potentially handle this by examining the operands of the 'and'
+; instruction.
+define void @cmp_and(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @cmp_and(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[LOOP]] ], [ [[DST]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[TMP0]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = load i64, ptr [[LSR_IV1]], align 8
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[VAL]], [[N]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i64 [[LSR_IV]], 0
+; CHECK-NEXT:    [[COND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
+  %val = load i64, ptr %gep, align 8
+  %iv.next = add i64 %iv, 1
+  %cmp1 = icmp ne i64 %val, %n
+  %cmp2 = icmp ne i64 %iv.next, %n
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>)
+declare i1 @cmp_fn(i64)
+
+attributes #0 = { "target-features"="+sve2" }