Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 24 additions & 12 deletions llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2181,8 +2181,8 @@ class LSRInstance {
SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;

void OptimizeShadowIV();
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
void OptimizeLoopTermCond();

void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
Expand Down Expand Up @@ -2416,7 +2416,7 @@ void LSRInstance::OptimizeShadowIV() {

/// If Cond has an operand that is an expression of an IV, set the IV user and
/// stride information and return true, otherwise return false.
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
for (IVStrideUse &U : IU)
if (U.getUser() == Cond) {
// NOTE: we could handle setcc instructions with multiple uses here, but
Expand Down Expand Up @@ -2476,7 +2476,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
// Check that the loop matches the pattern we're looking for.
if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
Cond->getPredicate() != CmpInst::ICMP_NE)
Expand Down Expand Up @@ -2620,15 +2620,26 @@ LSRInstance::OptimizeLoopTermCond() {
// one register value.

BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
if (!TermBr)
if (!TermBr || TermBr->isUnconditional())
continue;
// FIXME: Overly conservative, termination condition could be an 'or' etc..
if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))

Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
bool CondImmediatelyBeforeTerm = Cond && Cond->getNextNode() == TermBr;
// If the argument to TermBr is an extractelement, then the source of that
// instruction is what's generated the condition.
auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
if (Extract) {
Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
if (Cond && CondImmediatelyBeforeTerm)
CondImmediatelyBeforeTerm = Cond->getNextNode() == Extract;
}
// FIXME: We could do more here, like handling logical operations where one
// side is a cmp that uses an induction variable.
if (!Cond)
continue;

// Search IVUsesByStride to find Cond's IVUse if there is one.
IVStrideUse *CondUse = nullptr;
ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
if (!FindIVUserForCond(Cond, CondUse))
continue;

Expand All @@ -2638,7 +2649,8 @@ LSRInstance::OptimizeLoopTermCond() {
// One consequence of doing this now is that it disrupts the count-down
// optimization. That's not always a bad thing though, because in such
// cases it may still be worthwhile to avoid a max.
Cond = OptimizeMax(Cond, CondUse);
if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
Cond = OptimizeMax(Cmp, CondUse);

// If this exiting block dominates the latch block, it may also use
// the post-inc value if it won't be shared with other uses.
Expand Down Expand Up @@ -2703,13 +2715,13 @@ LSRInstance::OptimizeLoopTermCond() {
// It's possible for the setcc instruction to be anywhere in the loop, and
// possible for it to have multiple users. If it is not immediately before
// the exiting block branch, move it.
if (Cond->getNextNode() != TermBr) {
if (!CondImmediatelyBeforeTerm && isa<CmpInst>(Cond) && !Extract) {
if (Cond->hasOneUse()) {
Cond->moveBefore(TermBr->getIterator());
} else {
// Clone the terminating condition and insert into the loopend.
ICmpInst *OldCond = Cond;
Cond = cast<ICmpInst>(Cond->clone());
Instruction *OldCond = Cond;
Cond = Cond->clone();
Cond->setName(L->getHeader()->getName() + ".termcond");
Cond->insertInto(ExitingBlock, TermBr->getIterator());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,32 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: whilelo p1.d, xzr, x8
; CHECK-NEXT: cntd x9
; CHECK-NEXT: rdvl x10, #2
; CHECK-NEXT: mov w9, #100 // =0x64
; CHECK-NEXT: whilelo p1.d, xzr, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntd x10
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x11, x9
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
; CHECK-NEXT: mov z6.d, z0.d
; CHECK-NEXT: mov z7.d, z1.d
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: add x1, x1, x11
; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z1.d, p2/m, z7.d
; CHECK-NEXT: mov z0.d, p1/m, z6.d
; CHECK-NEXT: whilelo p1.d, x11, x8
; CHECK-NEXT: add x11, x11, x9
; CHECK-NEXT: whilelo p1.d, x8, x9
; CHECK-NEXT: b.mi .LBB0_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
Expand Down Expand Up @@ -213,19 +213,18 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: mov w8, #100 // =0x64
; CHECK-NEXT: whilelo p1.d, xzr, x8
; CHECK-NEXT: cntd x9
; CHECK-NEXT: rdvl x10, #2
; CHECK-NEXT: mov w9, #100 // =0x64
; CHECK-NEXT: whilelo p1.d, xzr, x9
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cntd x10
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cnth x11
; CHECK-NEXT: mov x12, x9
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x8, lsl #2]
; CHECK-NEXT: mov z6.d, z0.d
; CHECK-NEXT: mov z7.d, z1.d
; CHECK-NEXT: add x2, x2, x11
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: and z2.d, z2.d, #0xffffffff
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
Expand All @@ -234,16 +233,15 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: add x1, x1, x11
; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
; CHECK-NEXT: mov z1.d, p2/m, z7.d
; CHECK-NEXT: mov z0.d, p1/m, z6.d
; CHECK-NEXT: whilelo p1.d, x12, x8
; CHECK-NEXT: add x12, x12, x9
; CHECK-NEXT: whilelo p1.d, x8, x9
; CHECK-NEXT: b.mi .LBB2_1
; CHECK-NEXT: // %bb.2: // %exit.block
; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
Expand Down
205 changes: 205 additions & 0 deletions llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -loop-reduce %s -S -o - | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

; Tests where the loop termination condition is not generated by a compare.

; The call to get.active.lane.mask in the loop should use the postincrement
; value of %index.
define void @lane_mask(ptr %dst, i64 %n) #0 {
; CHECK-LABEL: define void @lane_mask(
; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP1:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IV]], 2
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP1]] = add i64 [[IV]], [[VSCALEX4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[N]])
; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
; vscale * 4 is the scalable IV step matching the <vscale x 4 x i32> store.
%vscale = tail call i64 @llvm.vscale.i64()
%vscalex4 = shl i64 %vscale, 2
%active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
%gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
%iv.next = add i64 %iv, %vscalex4
%active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
; The termination condition is an extractelement of the lane mask, not an
; icmp, which is the case this test exercises.
%cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
br i1 %cond, label %loop, label %exit

exit:
ret void
}

; The store between the call and the branch shouldn't prevent the
; postincrement value from being used.
define void @lane_mask_not_last(ptr %dst, i64 %n) #0 {
; CHECK-LABEL: define void @lane_mask_not_last(
; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP0:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[IV]], 2
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP0]] = add i64 [[IV]], [[VSCALEX4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP0]], i64 [[N]])
; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
%vscale = tail call i64 @llvm.vscale.i64()
%vscalex4 = shl i64 %vscale, 2
%active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %loop ]
%gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
%iv.next = add i64 %iv, %vscalex4
%active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %iv.next, i64 %n)
; Unlike @lane_mask, this store sits between the lane-mask call and the
; branch, so the condition is not immediately before the terminator.
tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
%cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
br i1 %cond, label %loop, label %exit

exit:
ret void
}

; The call to cmp_fn in the loop should use the postincrement value of %iv.
define void @uses_cmp_fn(ptr %dst, i64 %n) {
; CHECK-LABEL: define void @uses_cmp_fn(
; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[LSR_IV]], 2
; CHECK-NEXT: [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
; CHECK-NEXT: store i32 0, ptr [[LSR_IV1]], align 4
; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV_NEXT]])
; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
store i32 0, ptr %gep, align 4
%iv.next = add i64 %iv, 1
; The loop condition comes from an opaque call rather than a compare.
%cond = tail call i1 @cmp_fn(i64 %iv.next)
br i1 %cond, label %loop, label %exit

exit:
ret void
}

; The store between the call and the branch shouldn't prevent the
; postincrement value from being used.
define void @uses_cmp_fn_not_last(ptr %dst, i64 %n) {
; CHECK-LABEL: define void @uses_cmp_fn_not_last(
; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IV]], 2
; CHECK-NEXT: [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
; CHECK-NEXT: [[LSR_IV]] = add i64 [[IV]], 1
; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV]])
; CHECK-NEXT: store i32 0, ptr [[LSR_IV1]], align 4
; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
%iv.next = add i64 %iv, 1
%cond = tail call i1 @cmp_fn(i64 %iv.next)
; The store sits between the condition-producing call and the branch.
store i32 0, ptr %gep, align 4
br i1 %cond, label %loop, label %exit

exit:
ret void
}

; cmp2 will use a preincrement induction variable as it isn't directly the loop
; termination condition.
; FIXME: We could potentially handle this by examining the operands of the 'and'
; instruction.
define void @cmp_and(ptr %dst, i64 %n) {
; CHECK-LABEL: define void @cmp_and(
; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[LOOP]] ], [ [[DST]], %[[ENTRY]] ]
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ [[TMP0]], %[[ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[LSR_IV1]], align 8
; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[VAL]], [[N]]
; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[LSR_IV]], 0
; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP1]], [[CMP2]]
; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep = getelementptr inbounds nuw i32, ptr %dst, i64 %iv
%val = load i64, ptr %gep, align 8
%iv.next = add i64 %iv, 1
%cmp1 = icmp ne i64 %val, %n
%cmp2 = icmp ne i64 %iv.next, %n
; The branch condition is an 'and' of two compares, so neither compare is
; directly the terminator's condition.
%cond = and i1 %cmp1, %cmp2
br i1 %cond, label %loop, label %exit

exit:
ret void
}


declare i64 @llvm.vscale.i64()
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>)
declare i1 @cmp_fn(i64)

attributes #0 = { "target-features"="+sve2" }