66 changes: 18 additions & 48 deletions llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
/// during the transform and pseudo instructions are replaced by real ones. In
/// some cases, when we have to revert to a 'normal' loop, we have to introduce
/// multiple instructions for a single pseudo (see RevertWhile and
/// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
/// are defined to be as large as this maximum sequence of replacement
/// instructions.
///
Expand Down Expand Up @@ -102,7 +102,7 @@ static bool shouldInspect(MachineInstr &MI) {
}

static bool isDo(MachineInstr *MI) {
return MI->getOpcode() != ARM::t2WhileLoopStart;
return MI->getOpcode() != ARM::t2WhileLoopStartLR;
}

namespace {
Expand Down Expand Up @@ -442,7 +442,7 @@ namespace {
MachineOperand &getLoopStartOperand() {
if (IsTailPredicationLegal())
return TPNumElements;
return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0);
return Start->getOperand(1);
}

unsigned getStartOpcode() const {
Expand Down Expand Up @@ -1064,53 +1064,20 @@ void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
return false;
}

if (Start->getOpcode() == ARM::t2WhileLoopStart &&
if (Start->getOpcode() == ARM::t2WhileLoopStartLR &&
(BBUtils->getOffsetOf(Start) >
BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
!BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
BBUtils->getOffsetOf(Start->getOperand(2).getMBB()) ||
!BBUtils->isBBInRange(Start, Start->getOperand(2).getMBB(), 4094))) {
LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
return false;
}
return true;
};

// Find a suitable position to insert the loop start instruction. It needs to
// be able to safely define LR.
auto FindStartInsertionPoint = [](MachineInstr *Start, MachineInstr *Dec,
MachineBasicBlock::iterator &InsertPt,
MachineBasicBlock *&InsertBB,
ReachingDefAnalysis &RDA,
InstSet &ToRemove) {
// For a t2DoLoopStart it is always valid to use the start insertion point.
// For WLS we can define LR if LR already contains the same value.
if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) {
InsertPt = MachineBasicBlock::iterator(Start);
InsertBB = Start->getParent();
return true;
}

// We've found no suitable LR def and Start doesn't use LR directly. Can we
// just define LR anyway?
if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR)))
return false;

InsertPt = MachineBasicBlock::iterator(Start);
InsertBB = Start->getParent();
return true;
};

if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA,
ToRemove)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
Revert = true;
return;
}
LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end())
dbgs() << "ARM Loops: Will insert LoopStart at end of block\n";
else
dbgs() << "ARM Loops: Will insert LoopStart at "
<< *StartInsertPt
);
StartInsertPt = MachineBasicBlock::iterator(Start);
StartInsertBB = Start->getParent();
LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
<< *StartInsertPt);

Revert = !ValidateRanges(Start, End, BBUtils, ML);
CannotTailPredicate = !ValidateTailPredicate();
Expand Down Expand Up @@ -1317,6 +1284,9 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
return false;
}

assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
"Expected t2WhileLoopStart to be removed before regalloc!");

// Check that the only instruction using LoopDec is LoopEnd. This can only
// happen when the Dec and End are separate, not a single t2LoopEndDec.
// TODO: Check for copy chains that really have no effect.
Expand All @@ -1339,11 +1309,11 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
ARM::tBcc : ARM::t2Bcc;

RevertWhileLoopStart(MI, TII, BrOpc);
RevertWhileLoopStartLR(MI, TII, BrOpc);
}

void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
Expand Down Expand Up @@ -1478,7 +1448,7 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
MIB.addDef(ARM::LR);
MIB.add(Count);
if (!isDo(Start))
MIB.add(Start->getOperand(1));
MIB.add(Start->getOperand(2));

LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
NewStart = &*MIB;
Expand Down Expand Up @@ -1657,7 +1627,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
};

if (LoLoop.Revert) {
if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStartLR)
RevertWhile(LoLoop.Start);
else
RevertDo(LoLoop.Start);
Expand Down Expand Up @@ -1728,7 +1698,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() {
Changed = true;

for (auto *Start : Starts) {
if (Start->getOpcode() == ARM::t2WhileLoopStart)
if (Start->getOpcode() == ARM::t2WhileLoopStartLR)
RevertWhile(Start);
else
RevertDo(Start);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1870,7 +1870,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
default:
break;
case Intrinsic::start_loop_iterations:
case Intrinsic::test_set_loop_iterations:
case Intrinsic::test_start_loop_iterations:
case Intrinsic::loop_decrement:
case Intrinsic::loop_decrement_reg:
return true;
Expand Down
124 changes: 118 additions & 6 deletions llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class MVETPAndVPTOptimisations : public MachineFunctionPass {
}

private:
bool LowerWhileLoopStart(MachineLoop *ML);
bool MergeLoopEnd(MachineLoop *ML);
bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
Expand All @@ -74,6 +75,7 @@ class MVETPAndVPTOptimisations : public MachineFunctionPass {
bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
bool ConvertVPSEL(MachineBasicBlock &MBB);
bool HintDoLoopStartReg(MachineBasicBlock &MBB);
};

char MVETPAndVPTOptimisations::ID = 0;
Expand Down Expand Up @@ -164,7 +166,9 @@ static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
? LoopPhi->getOperand(3).getReg()
: LoopPhi->getOperand(1).getReg();
LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
if (!LoopStart || LoopStart->getOpcode() != ARM::t2DoLoopStart) {
if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n");
return false;
}
Expand All @@ -173,6 +177,82 @@ static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
return true;
}

// Revert a t2WhileLoopSetup pseudo, together with the t2WhileLoopStart
// terminator of the same block (if one is present), into ordinary
// instructions:
//   %a = t2WhileLoopSetup %Cnt   ->   %a = t2SUBri %Cnt, 0 (defining CPSR)
//   t2WhileLoopStart %a, %BB     ->   t2Bcc %BB, eq
// \p MI must be the t2WhileLoopSetup; it is erased before returning, so the
// caller must not use it afterwards.
static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
  MachineBasicBlock *MBB = MI->getParent();
  assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
         "Only expected a t2WhileLoopSetup in RevertWhileLoopSetup!");

  // Subs: recreate the value defined by the setup and set the flags that the
  // conditional branch below consumes.
  MachineInstrBuilder MIB =
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  MIB.add(MI->getOperand(0));
  MIB.add(MI->getOperand(1));
  MIB.addImm(0);
  MIB.addImm(ARMCC::AL);
  MIB.addReg(ARM::NoRegister);
  MIB.addReg(ARM::CPSR, RegState::Define);

  // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
  for (MachineInstr &I : MBB->terminators()) {
    if (I.getOpcode() == ARM::t2WhileLoopStart) {
      MachineInstrBuilder BrMIB =
          BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
      // The branch target lives on the t2WhileLoopStart (operand 1), not on
      // the t2WhileLoopSetup, whose operand 1 is the count register.
      BrMIB.add(I.getOperand(1)); // branch target
      BrMIB.addImm(ARMCC::EQ);
      BrMIB.addReg(ARM::CPSR);
      // Safe to erase while iterating only because we leave the loop at once.
      I.eraseFromParent();
      break;
    }
  }

  MI->eraseFromParent();
}

// The Hardware Loop insertion and ISel Lowering produce the pseudos for the
// start of a while loop:
//   %a:gprlr = t2WhileLoopSetup %Cnt
//   t2WhileLoopStart %a, %BB
// We want to convert those to a single instruction which, like t2LoopEndDec and
// t2DoLoopStartTP is both a terminator and produces a value:
//   %a:gprlr = t2WhileLoopStartLR %Cnt, %BB
//
// Otherwise if we can't, we revert the loop. t2WhileLoopSetup and
// t2WhileLoopStart are not valid past regalloc.
//
// Returns true if the function was modified (the pseudos were either merged or
// reverted), and false if the loop components could not be found or the loop
// does not start with a t2WhileLoopSetup.
bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
  LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
                    << ML->getHeader()->getName() << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
    return false;

  // Look for the t2WhileLoopStart that consumes the value defined by the setup.
  Register LR = LoopStart->getOperand(0).getReg();
  auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
    return MI.getOpcode() == ARM::t2WhileLoopStart;
  });
  if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
    // Revert the whole loop. RevertWhileLoopSetup erases LoopStart, so the
    // decrement and the loop end must be reverted through their own
    // instructions, never through LoopStart.
    RevertWhileLoopSetup(LoopStart, TII);
    RevertLoopDec(LoopDec, TII);
    RevertLoopEnd(LoopEnd, TII);
    return true;
  }

  // Build the combined instruction in place of the t2WhileLoopStart: it
  // defines LR from the setup's count operand and branches to the
  // t2WhileLoopStart's target.
  MachineInstrBuilder MI =
      BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
              TII->get(ARM::t2WhileLoopStartLR), LR)
          .add(LoopStart->getOperand(1))
          .add(WLSIt->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());

  WLSIt->eraseFromParent();
  LoopStart->eraseFromParent();
  return true;
}

// This function converts loops with t2LoopDec and t2LoopEnd instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
Expand All @@ -192,12 +272,19 @@ bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
return false;

// Check if there is an illegal instruction (a call) in the low overhead loop
// and if so revert it now before we get any further.
for (MachineBasicBlock *MBB : ML->blocks()) {
// and if so revert it now before we get any further. While loops also need to
// check the preheaders.
SmallPtrSet<MachineBasicBlock *, 4> MBBs(ML->block_begin(), ML->block_end());
if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR)
MBBs.insert(ML->getHeader()->pred_begin(), ML->getHeader()->pred_end());
for (MachineBasicBlock *MBB : MBBs) {
for (MachineInstr &MI : *MBB) {
if (MI.isCall()) {
LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI);
RevertDoLoopStart(LoopStart, TII);
if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
RevertDoLoopStart(LoopStart, TII);
else
RevertWhileLoopStartLR(LoopStart, TII);
RevertLoopDec(LoopDec, TII);
RevertLoopEnd(LoopEnd, TII);
return true;
Expand Down Expand Up @@ -236,8 +323,16 @@ bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
};
if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
!CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
!CheckUsers(StartReg, {LoopPhi}, MRI))
!CheckUsers(StartReg, {LoopPhi}, MRI)) {
// Don't leave a t2WhileLoopStartLR without the LoopDecEnd.
if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
RevertWhileLoopStartLR(LoopStart, TII);
RevertLoopDec(LoopDec, TII);
RevertLoopEnd(LoopEnd, TII);
return true;
}
return false;
}

MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
Expand Down Expand Up @@ -281,7 +376,7 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
return false;
if (LoopDec != LoopEnd)
if (LoopDec != LoopEnd || LoopStart->getOpcode() != ARM::t2DoLoopStart)
return false;

SmallVector<MachineInstr *, 4> VCTPs;
Expand Down Expand Up @@ -852,6 +947,21 @@ bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
return !DeadInstructions.empty();
}

// Add a register allocation hint for every t2DoLoopStart in the block, steering
// the register in its operand 1 towards LR, as the instruction may then be
// removable as a noop. Returns true if at least one hint was added.
bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
  auto &RegInfo = MBB.getParent()->getRegInfo();
  bool HintedAny = false;
  for (MachineInstr &Instr : MBB.instrs()) {
    if (Instr.getOpcode() == ARM::t2DoLoopStart) {
      RegInfo.setRegAllocationHint(Instr.getOperand(1).getReg(), ARMRI::RegLR,
                                   0);
      HintedAny = true;
    }
  }
  return HintedAny;
}

bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
Expand All @@ -869,11 +979,13 @@ bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {

bool Modified = false;
for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
Modified |= LowerWhileLoopStart(ML);
Modified |= MergeLoopEnd(ML);
Modified |= ConvertTailPredLoop(ML, DT);
}

for (MachineBasicBlock &MBB : Fn) {
Modified |= HintDoLoopStartReg(MBB);
Modified |= ReplaceConstByVPNOTs(MBB, DT);
Modified |= ReplaceVCMPsByVPNOTs(MBB);
Modified |= ReduceOldVCCRValueUses(MBB);
Expand Down
19 changes: 12 additions & 7 deletions llvm/lib/Target/ARM/MVETailPredUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,26 +71,31 @@ static inline bool isVCTP(const MachineInstr *MI) {
static inline bool isLoopStart(MachineInstr &MI) {
return MI.getOpcode() == ARM::t2DoLoopStart ||
MI.getOpcode() == ARM::t2DoLoopStartTP ||
MI.getOpcode() == ARM::t2WhileLoopStart;
MI.getOpcode() == ARM::t2WhileLoopStart ||
MI.getOpcode() == ARM::t2WhileLoopStartLR;
}

// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
// WhileLoopStart holds the exit block, so produce a subs Op0, Op1, 0 and then a
// beq that branches to the exit branch.
inline void RevertWhileLoopStart(MachineInstr *MI, const TargetInstrInfo *TII,
unsigned BrOpc = ARM::t2Bcc) {
inline void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII,
unsigned BrOpc = ARM::t2Bcc) {
MachineBasicBlock *MBB = MI->getParent();
assert(MI->getOpcode() == ARM::t2WhileLoopStartLR &&
"Only expected a t2WhileLoopStartLR in RevertWhileLoopStartLR!");

// Cmp
// Subs
MachineInstrBuilder MIB =
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri));
BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
MIB.add(MI->getOperand(0));
MIB.add(MI->getOperand(1));
MIB.addImm(0);
MIB.addImm(ARMCC::AL);
MIB.addReg(ARM::NoRegister);
MIB.addReg(ARM::CPSR, RegState::Define);

// Branch
MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
MIB.add(MI->getOperand(1)); // branch target
MIB.add(MI->getOperand(2)); // branch target
MIB.addImm(ARMCC::EQ); // condition code
MIB.addReg(ARM::CPSR);

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/MVETailPredication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {

Intrinsic::ID ID = Call->getIntrinsicID();
if (ID == Intrinsic::start_loop_iterations ||
ID == Intrinsic::test_set_loop_iterations)
ID == Intrinsic::test_start_loop_iterations)
return cast<IntrinsicInst>(&I);
}
return nullptr;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ body: |
renamable $r12 = t2LDRi12 $sp, 48, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
renamable $r5 = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
renamable $r7, dead $cpsr = tLSRri killed renamable $r5, 2, 14, $noreg
t2WhileLoopStart renamable $r7, %bb.3, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $r7, %bb.3, implicit-def dead $cpsr
tB %bb.1, 14, $noreg
bb.1.for.body.lr.ph:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ body: |
; CHECK: liveins: $r1, $r2, $r3, $r5, $r7, $r8, $r12
; CHECK: $r9, $r4 = t2LDRDi8 $r3, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20)
; CHECK: $r6, $r0 = t2LDRDi8 $r3, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24)
; CHECK: t2CMPri renamable $r8, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: dead $lr = t2SUBri renamable $r8, 0, 14 /* CC::al */, $noreg, def $cpsr
; CHECK: tBcc %bb.1, 0 /* CC::eq */, killed $cpsr
; CHECK: tB %bb.3, 14 /* CC::al */, $noreg
; CHECK: bb.3.bb27:
Expand Down Expand Up @@ -334,7 +334,7 @@ body: |
$r9, $r4 = t2LDRDi8 $r3, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20)
$r6, $r0 = t2LDRDi8 $r3, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24)
t2WhileLoopStart renamable $r8, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $r8, %bb.1, implicit-def dead $cpsr
tB %bb.3, 14 /* CC::al */, $noreg
bb.3.bb27:
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ body: |
$r9, $r4 = t2LDRDi8 $r3, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20)
$r6, $r0 = t2LDRDi8 $r3, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24)
t2WhileLoopStart renamable $r8, %bb.5, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $r8, %bb.5, implicit-def dead $cpsr
tB %bb.2, 14 /* CC::al */, $noreg
bb.2.bb27:
Expand Down
53 changes: 28 additions & 25 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/branch-targets.ll
Original file line number Diff line number Diff line change
Expand Up @@ -402,17 +402,18 @@ for.cond.cleanup:
}

; CHECK-MID: check_negated_xor_wls
; CHECK-MID: t2WhileLoopStart $r2, %bb.3
; CHECK-MID: $lr = t2WhileLoopStartLR killed renamable $r2
; CHECK-MID: tB %bb.1
; CHECK-MID: bb.1.while.body.preheader:
; CHECK-MID: $lr = t2LoopDec killed renamable $lr, 1
; CHECK-MID: t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
; CHECk-MID: tB %bb.3
; CHECK-MID: bb.3.while.end:
; CHECK-MID: bb.1.while.body:
; CHECK-MID: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1
; CHECk-MID: tB %bb.2
; CHECK-MID: bb.2.while.end:
define void @check_negated_xor_wls(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
entry:
%wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
%xor = xor i1 %wls, 1
%wls = call {i32, i1} @llvm.test.start.loop.iterations.i32(i32 %N)
%wls0 = extractvalue {i32, i1} %wls, 0
%wls1 = extractvalue {i32, i1} %wls, 1
%xor = xor i1 %wls1, 1
br i1 %xor, label %while.end, label %while.body.preheader

while.body.preheader:
Expand All @@ -421,7 +422,7 @@ while.body.preheader:
while.body:
%a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
%b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
%count = phi i32 [ %N, %while.body.preheader ], [ %count.next, %while.body ]
%count = phi i32 [ %wls0, %while.body.preheader ], [ %count.next, %while.body ]
%incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
%ld.b = load i16, i16* %b.addr.05, align 2
%incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
Expand All @@ -435,17 +436,18 @@ while.end:
}

; CHECK-MID: check_negated_cmp_wls
; CHECK-MID: t2WhileLoopStart $r2, %bb.3
; CHECK-MID: $lr = t2WhileLoopStartLR killed renamable $r2
; CHECK-MID: tB %bb.1
; CHECK-MID: bb.1.while.body.preheader:
; CHECK-MID: $lr = t2LoopDec killed renamable $lr, 1
; CHECK-MID: t2LoopEnd renamable $lr, %bb.2
; CHECk-MID: tB %bb.3
; CHECK-MID: bb.3.while.end:
; CHECK-MID: bb.1.while.body:
; CHECK-MID: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1
; CHECk-MID: tB %bb.2
; CHECK-MID: bb.2.while.end:
define void @check_negated_cmp_wls(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
entry:
%wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
%cmp = icmp ne i1 %wls, 1
%wls = call {i32, i1} @llvm.test.start.loop.iterations.i32(i32 %N)
%wls0 = extractvalue {i32, i1} %wls, 0
%wls1 = extractvalue {i32, i1} %wls, 1
%cmp = icmp ne i1 %wls1, 1
br i1 %cmp, label %while.end, label %while.body.preheader

while.body.preheader:
Expand All @@ -454,7 +456,7 @@ while.body.preheader:
while.body:
%a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
%b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
%count = phi i32 [ %N, %while.body.preheader ], [ %count.next, %while.body ]
%count = phi i32 [ %wls0, %while.body.preheader ], [ %count.next, %while.body ]
%incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
%ld.b = load i16, i16* %b.addr.05, align 2
%incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
Expand All @@ -468,11 +470,10 @@ while.end:
}

; CHECK-MID: check_negated_reordered_wls
; CHECK-MID: t2WhileLoopStart killed $r2, %bb.2
; CHECK-MID: $lr = t2WhileLoopStartLR killed renamable $r2
; CHECK-MID: tB %bb.1
; CHECK-MID: bb.1.while.body:
; CHECK-MID: $lr = t2LoopDec killed renamable $lr, 1
; CHECK-MID: t2LoopEnd renamable $lr, %bb.1
; CHECK-MID: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1
; CHECk-MID: tB %bb.2
; CHECK-MID: bb.2.while.end:
define void @check_negated_reordered_wls(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
Expand All @@ -485,7 +486,7 @@ while.body.preheader:
while.body:
%a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
%b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
%count = phi i32 [ %N, %while.body.preheader ], [ %count.next, %while.body ]
%count = phi i32 [ %wls0, %while.body.preheader ], [ %count.next, %while.body ]
%incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
%ld.b = load i16, i16* %b.addr.05, align 2
%incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
Expand All @@ -495,14 +496,16 @@ while.body:
br i1 %cmp, label %while.body, label %while.end

while:
%wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
%xor = xor i1 %wls, 1
%wls = call {i32, i1} @llvm.test.start.loop.iterations.i32(i32 %N)
%wls0 = extractvalue {i32, i1} %wls, 0
%wls1 = extractvalue {i32, i1} %wls, 1
%xor = xor i1 %wls1, 1
br i1 %xor, label %while.end, label %while.body.preheader

while.end:
ret void
}

declare i32 @llvm.start.loop.iterations.i32(i32)
declare i1 @llvm.test.set.loop.iterations.i32(i32)
declare {i32, i1} @llvm.test.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
11 changes: 6 additions & 5 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: beq .LBB0_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
; CHECK-NEXT: subs r5, r3, #1
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: and r12, r3, #3
; CHECK-NEXT: cmp r5, #3
; CHECK-NEXT: bhs .LBB0_6
; CHECK-NEXT: @ %bb.3:
Expand All @@ -44,7 +44,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: letp lr, .LBB0_5
; CHECK-NEXT: b .LBB0_11
; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new
; CHECK-NEXT: sub.w r12, r3, lr
; CHECK-NEXT: sub.w lr, r3, r12
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: .LBB0_7: @ %for.body
Expand All @@ -56,7 +56,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: vldr s0, [r5]
; CHECK-NEXT: adds r4, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: cmp lr, r3
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: vldr s0, [r5, #4]
Expand All @@ -73,7 +73,7 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: vstr s0, [r7, #12]
; CHECK-NEXT: bne .LBB0_7
; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, lr, .LBB0_11
; CHECK-NEXT: wls lr, r12, .LBB0_11
; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r1, r1, r3, lsl #2
; CHECK-NEXT: add.w r2, r2, r3, lsl #2
Expand Down Expand Up @@ -219,8 +219,9 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r12, r3, lsr #2
; CHECK-NEXT: add.w r12, r12, r3, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ if.end: ; preds = %while.body, %while.
; CHECK: be_ne
; CHECK: body:
; CHECK: bb.0.entry:
; CHECK: $lr = t2DLS killed renamable $r12
; CHECK: $lr =
; CHECK: bb.2.do.body:
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
Expand All @@ -179,13 +179,11 @@ if.end: ; preds = %do.body, %entry
ret void
}

; TODO: Remove the tMOVr in the preheader!
; CHECK: ne_trip_count
; CHECK: body:
; CHECK: bb.0.entry:
; CHECK: $lr = t2WLS $r3, %bb.3
; CHECK: $lr = t2WLS killed renamable $r3, %bb.3
; CHECK: bb.1.do.body.preheader:
; CHECK: $lr = tMOVr
; CHECK: bb.2.do.body:
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float
; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB0_7
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: wls lr, r7, .LBB0_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
Expand Down Expand Up @@ -246,8 +246,8 @@ define arm_aapcs_vfpcc void @float_float_add(float* nocapture readonly %a, float
; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB1_7
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: wls lr, r7, .LBB1_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
Expand Down Expand Up @@ -459,8 +459,8 @@ define arm_aapcs_vfpcc void @float_float_sub(float* nocapture readonly %a, float
; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB2_7
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: wls lr, r7, .LBB2_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
Expand Down Expand Up @@ -681,8 +681,8 @@ define arm_aapcs_vfpcc void @float_int_mul(float* nocapture readonly %a, i32* no
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: add.w r8, r7, r3
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: wls lr, lr, .LBB3_10
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: wls lr, r7, .LBB3_10
; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r5, r0, r12, lsl #2
; CHECK-NEXT: add.w r6, r1, r12, lsl #2
Expand Down Expand Up @@ -1424,7 +1424,7 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* n
; CHECK-NEXT: cbz r2, .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and lr, r2, #3
; CHECK-NEXT: and r12, r2, #3
; CHECK-NEXT: vldr s0, .LCPI9_0
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB9_4
Expand All @@ -1435,7 +1435,7 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* n
; CHECK-NEXT: vldr s0, .LCPI9_0
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
; CHECK-NEXT: sub.w r12, r2, lr
; CHECK-NEXT: sub.w lr, r2, r12
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: .LBB9_5: @ %for.body
Expand All @@ -1459,15 +1459,15 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* n
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vmul.f16 s8, s10, s8
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: cmp lr, r2
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: bne .LBB9_5
; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, lr, .LBB9_9
; CHECK-NEXT: wls lr, r12, .LBB9_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
Expand Down Expand Up @@ -1576,7 +1576,7 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* n
; CHECK-NEXT: cbz r2, .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and lr, r2, #3
; CHECK-NEXT: and r12, r2, #3
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB10_4
Expand All @@ -1587,7 +1587,7 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* n
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: b .LBB10_9
; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
; CHECK-NEXT: sub.w r12, r2, lr
; CHECK-NEXT: sub.w lr, r2, r12
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: .LBB10_5: @ %for.body
Expand All @@ -1611,15 +1611,15 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* n
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vadd.f16 s8, s10, s8
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: cmp lr, r2
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: bne .LBB10_5
; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, lr, .LBB10_9
; CHECK-NEXT: wls lr, r12, .LBB10_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
Expand Down Expand Up @@ -1728,7 +1728,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT: cbz r2, .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and lr, r2, #3
; CHECK-NEXT: and r12, r2, #3
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB11_4
Expand All @@ -1739,7 +1739,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
; CHECK-NEXT: sub.w r12, r2, lr
; CHECK-NEXT: sub.w lr, r2, r12
; CHECK-NEXT: adds r3, r1, #4
; CHECK-NEXT: adds r4, r0, #4
; CHECK-NEXT: movs r2, #0
Expand All @@ -1748,7 +1748,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT: ldrsh.w r5, [r3, #2]
; CHECK-NEXT: vldr.16 s2, [r4, #2]
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: cmp r12, r2
; CHECK-NEXT: cmp lr, r2
; CHECK-NEXT: vmov s4, r5
; CHECK-NEXT: ldrsh r5, [r3], #8
; CHECK-NEXT: vcvt.f16.s32 s4, s4
Expand Down Expand Up @@ -1778,7 +1778,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: bne .LBB11_5
; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, lr, .LBB11_9
; CHECK-NEXT: wls lr, r12, .LBB11_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
Expand Down
15 changes: 10 additions & 5 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -90,8 +91,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -165,8 +167,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -240,8 +243,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -315,8 +319,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ body: |
frame-setup CFI_INSTRUCTION offset $r7, -8
renamable $r3, dead $cpsr = tADDi3 renamable $r2, 7, 14, $noreg
renamable $lr = t2LSRri killed renamable $r3, 3, 14, $noreg, $noreg
t2WhileLoopStart renamable $lr, %bb.4, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $lr, %bb.4, implicit-def dead $cpsr
tB %bb.1, 14, $noreg
bb.1.for.body.preheader:
Expand Down
35 changes: 22 additions & 13 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: add.w r3, r3, r12, lsr #3
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -145,7 +146,8 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur
; CHECK-NEXT: bic r3, r3, #15
; CHECK-NEXT: sub.w r12, r3, #16
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #4
; CHECK-NEXT: add.w r3, r3, r12, lsr #4
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r2
Expand Down Expand Up @@ -214,7 +216,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: add.w r3, r3, r12, lsr #3
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -285,7 +288,8 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur
; CHECK-NEXT: bic r3, r3, #15
; CHECK-NEXT: sub.w r12, r3, #16
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #4
; CHECK-NEXT: add.w r3, r3, r12, lsr #4
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r2
Expand Down Expand Up @@ -354,7 +358,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #3
; CHECK-NEXT: add.w r3, r3, r12, lsr #3
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -413,19 +418,20 @@ for.cond.cleanup: ; preds = %middle.block, %entr
define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: two_loops_mul_add_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq .LBB6_8
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: subs r6, r3, #4
; CHECK-NEXT: subs r7, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r3, r6, lsr #2
; CHECK-NEXT: add.w r6, r3, r7, lsr #2
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r3
Expand All @@ -445,8 +451,9 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q0, r3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r6, lsr #2
; CHECK-NEXT: add.w r3, r3, r7, lsr #2
; CHECK-NEXT: vmov.32 q0[0], r12
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB6_5: @ %vector.body46
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand All @@ -463,10 +470,10 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
; CHECK-NEXT: vaddv.u32 r12, q0
; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup7
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB6_8:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%cmp35 = icmp eq i32 %N, 0
br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph
Expand Down Expand Up @@ -548,9 +555,10 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: add.w lr, r4, r3, lsr #3
; CHECK-NEXT: add.w r12, r4, r3, lsr #3
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
Expand Down Expand Up @@ -668,7 +676,8 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) {
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w lr, r12, r0, lsr #2
; CHECK-NEXT: add.w r0, r12, r0, lsr #2
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-non-loop.mir
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# CHECK: bb.0.entry:
# CHECK: tBcc %bb.2, 3
# CHECK: bb.1.not.preheader:
# CHECK: t2CMPri renamable $lr, 0, 14
# CHECK: $lr = t2SUBri killed renamable $lr, 0, 14
# CHECK: tBcc %bb.4, 0
# CHECK: tB %bb.2
# CHECK: bb.3.while.body:
Expand Down Expand Up @@ -119,7 +119,7 @@ body: |
successors: %bb.2(0x40000000), %bb.4(0x40000000)
liveins: $lr, $r0, $r1
t2WhileLoopStart renamable $lr, %bb.4, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $lr, %bb.4, implicit-def dead $cpsr
tB %bb.2, 14, $noreg
bb.2.while.body.preheader:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/revert-while.mir
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ body: |
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: t2CMPri $r3, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: dead $lr = t2SUBri $r3, 0, 14 /* CC::al */, $noreg, def $cpsr
; CHECK: t2Bcc %bb.3, 0 /* CC::eq */, killed $cpsr
; CHECK: tB %bb.1, 14 /* CC::al */, $noreg
; CHECK: bb.1.do.body.preheader:
Expand Down Expand Up @@ -130,7 +130,7 @@ body: |
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
t2WhileLoopStart $r3, %bb.3, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR $r3, %bb.3, implicit-def dead $cpsr
tB %bb.1, 14, $noreg
bb.1.do.body.preheader:
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ define i32 @bad(i32* readonly %x, i32* nocapture readonly %y, i32 %n) {
; CHECK-NEXT: subs r3, r2, r3
; CHECK-NEXT: add.w r12, r3, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -91,8 +92,9 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down Expand Up @@ -161,8 +163,9 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ body: |
renamable $r12 = t2LDRi12 $sp, 44, 14, $noreg :: (load 4 from %fixed-stack.0, align 8)
renamable $r5 = t2ADDri renamable $r12, 3, 14, $noreg, $noreg
renamable $lr = t2LSRri killed renamable $r5, 2, 14, $noreg, $noreg
t2WhileLoopStart renamable $lr, %bb.3, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $lr, %bb.3, implicit-def dead $cpsr
tB %bb.1, 14, $noreg
bb.1.for.body.lr.ph:
Expand Down
117 changes: 55 additions & 62 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmldava_in_vpt.mir
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
entry:
%add = add i32 %block_size, 3
%div = lshr i32 %add, 2
%0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %div)
br i1 %0, label %for.body.lr.ph, label %for.cond.cleanup
%0 = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div)
%wls0 = extractvalue { i32, i1 } %0, 0
%wls1 = extractvalue { i32, i1 } %0, 1
br i1 %wls1, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
%.splatinsert.i41 = insertelement <4 x i32> undef, i32 %out_activation_min, i32 0
Expand All @@ -21,7 +23,7 @@
ret i32 %res

for.body: ; preds = %for.body, %for.body.lr.ph
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %div, %for.body.lr.ph ]
%lsr.iv = phi i32 [ %iv.next, %for.body ], [ %wls0, %for.body.lr.ph ]
%input_1_vect.addr.052 = phi i8* [ %input_1_vect, %for.body.lr.ph ], [ %add.ptr, %for.body ]
%input_2_vect.addr.051 = phi i8* [ %input_2_vect, %for.body.lr.ph ], [ %add.ptr14, %for.body ]
%num_elements.049 = phi i32 [ %block_size, %for.body.lr.ph ], [ %sub, %for.body ]
Expand All @@ -47,9 +49,8 @@
%add.ptr = getelementptr inbounds i8, i8* %input_1_vect.addr.052, i32 4
%add.ptr14 = getelementptr inbounds i8, i8* %input_2_vect.addr.051, i32 4
%sub = add i32 %num_elements.049, -4
%iv.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1)
%iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%cmp = icmp ne i32 %iv.next, 0
%lsr.iv.next = add i32 %lsr.iv, -1
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
Expand All @@ -58,8 +59,8 @@
declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1
declare i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>) #1
declare i1 @llvm.test.set.loop.iterations.i32(i32) #4
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #4
declare { i32, i1 } @llvm.test.start.loop.iterations.i32(i32) #4
declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
...
---
name: vmldava_in_vpt
Expand All @@ -82,7 +83,7 @@ frameInfo:
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 20
stackSize: 16
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
Expand Down Expand Up @@ -120,117 +121,109 @@ stack:
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 4, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4,
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
debugValueSubstitutions: []
constants: []
machineFunctionInfo: {}
body: |
; CHECK-LABEL: name: vmldava_in_vpt
; CHECK: bb.0.entry:
; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7
; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 20
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6
; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $lr, implicit-def $sp, implicit $sp
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -16
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -20
; CHECK: renamable $r7 = tLDRspi $sp, 10, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.5)
; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -8
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16
; CHECK: renamable $r4 = tLDRspi $sp, 9, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.5)
; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
; CHECK: $lr = MVE_WLSTP_32 killed renamable $r7, %bb.3
; CHECK: renamable $r5, dead $cpsr = tADDi3 renamable $r4, 3, 14 /* CC::al */, $noreg
; CHECK: dead renamable $r5, dead $cpsr = tLSRri killed renamable $r5, 2, 14 /* CC::al */, $noreg
; CHECK: $lr = MVE_WLSTP_32 killed renamable $r4, %bb.3
; CHECK: bb.1.for.body.lr.ph:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
; CHECK: $r5, $r12 = t2LDRDi8 $sp, 32, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.3), (load 4 from %fixed-stack.4, align 8)
; CHECK: renamable $r4 = tLDRspi $sp, 5, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: renamable $r5 = tLDRspi $sp, 4, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8)
; CHECK: $r6, $r12 = t2LDRDi8 $sp, 28, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.3), (load 4 from %fixed-stack.4, align 8)
; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q1
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r6, 0, $noreg, undef renamable $q1
; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
; CHECK: bb.2.for.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r12
; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r5, $r12
; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 0, $noreg :: (load 16 from %ir.input_2_cast, align 4)
; CHECK: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 0, $noreg :: (load 16 from %ir.input_1_cast, align 4)
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2
; CHECK: renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3
; CHECK: renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2
; CHECK: renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2
; CHECK: renamable $q3 = MVE_VMLAS_qr_u32 killed renamable $q3, killed renamable $q2, renamable $r5, 0, $noreg
; CHECK: renamable $q2 = MVE_VMAXu32 killed renamable $q3, renamable $q1, 0, $noreg, undef renamable $q2
; CHECK: renamable $q3 = MVE_VMINu32 renamable $q2, renamable $q0, 0, $noreg, undef renamable $q3
; CHECK: renamable $r12 = MVE_VMLADAVas32 killed renamable $r12, killed renamable $q3, killed renamable $q2, 0, killed $noreg
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
; CHECK: bb.3.for.cond.cleanup:
; CHECK: liveins: $r12
; CHECK: $r0 = tMOVr killed $r12, 14 /* CC::al */, $noreg
; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $pc, implicit killed $r0
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $pc, implicit killed $r0
bb.0.entry:
successors: %bb.1(0x40000000), %bb.3(0x40000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $lr
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $lr
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 20
frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
frame-setup CFI_INSTRUCTION offset $r6, -12
frame-setup CFI_INSTRUCTION offset $r5, -16
frame-setup CFI_INSTRUCTION offset $r4, -20
renamable $r7 = tLDRspi $sp, 10, 14, $noreg :: (load 4 from %fixed-stack.0)
renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
renamable $r4, dead $cpsr = tADDi3 renamable $r7, 3, 14, $noreg
renamable $r5, dead $cpsr = tLSRri killed renamable $r4, 2, 14, $noreg
t2WhileLoopStart renamable $r5, %bb.3, implicit-def dead $cpsr
tB %bb.1, 14, $noreg
frame-setup CFI_INSTRUCTION offset $r6, -8
frame-setup CFI_INSTRUCTION offset $r5, -12
frame-setup CFI_INSTRUCTION offset $r4, -16
renamable $r4 = tLDRspi $sp, 9, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0)
renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
renamable $r5, dead $cpsr = tADDi3 renamable $r4, 3, 14 /* CC::al */, $noreg
renamable $r5, dead $cpsr = tLSRri killed renamable $r5, 2, 14 /* CC::al */, $noreg
renamable $lr = t2WhileLoopStartLR killed renamable $r5, %bb.3, implicit-def dead $cpsr
tB %bb.1, 14 /* CC::al */, $noreg
bb.1.for.body.lr.ph:
successors: %bb.2(0x80000000)
liveins: $r0, $r1, $r2, $r3, $r5, $r7
liveins: $lr, $r0, $r1, $r2, $r3, $r4
$r6 = tMOVr killed $r5, 14, $noreg
$r5, $r12 = t2LDRDi8 $sp, 32, 14, $noreg :: (load 4 from %fixed-stack.2), (load 4 from %fixed-stack.1, align 8)
renamable $r4 = tLDRspi $sp, 5, 14, $noreg :: (load 4 from %fixed-stack.5, align 8)
renamable $r5 = tLDRspi $sp, 4, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.5, align 8)
$r6, $r12 = t2LDRDi8 $sp, 28, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.2), (load 4 from %fixed-stack.1, align 8)
renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0
renamable $q1 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q1
renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
renamable $q1 = MVE_VDUP32 killed renamable $r6, 0, $noreg, undef renamable $q1
renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
bb.2.for.body:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
liveins: $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r6, $r7, $r12
liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5, $r12
renamable $vpr = MVE_VCTP32 renamable $r7, 0, $noreg
renamable $vpr = MVE_VCTP32 renamable $r4, 0, $noreg
MVE_VPST 8, implicit $vpr
renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 1, renamable $vpr :: (load 16 from %ir.input_2_cast, align 4)
MVE_VPST 8, implicit $vpr
renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 1, renamable $vpr :: (load 16 from %ir.input_1_cast, align 4)
renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2
renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3
$lr = tMOVr $r6, 14, $noreg
renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2
renamable $r6, dead $cpsr = tSUBi8 killed $r6, 1, 14, $noreg
renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2
renamable $r7, dead $cpsr = tSUBi8 killed renamable $r7, 4, 14, $noreg
renamable $r4, dead $cpsr = tSUBi8 killed renamable $r4, 4, 14 /* CC::al */, $noreg
renamable $q3 = MVE_VMLAS_qr_u32 killed renamable $q3, killed renamable $q2, renamable $r5, 0, $noreg
MVE_VPST 2, implicit $vpr
renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 1, renamable $vpr, undef renamable $q2
renamable $q2 = MVE_VMAXu32 killed renamable $q3, renamable $q1, 1, renamable $vpr, undef renamable $q2
renamable $q3 = MVE_VMINu32 renamable $q2, renamable $q0, 1, renamable $vpr, undef renamable $q3
renamable $r12 = MVE_VMLADAVas32 killed renamable $r12, killed renamable $q3, killed renamable $q2, 1, killed renamable $vpr
renamable $lr = t2LoopDec killed renamable $lr, 1
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
renamable $lr = t2LoopEndDec killed renamable $lr, %bb.2, implicit-def dead $cpsr
tB %bb.3, 14 /* CC::al */, $noreg
bb.3.for.cond.cleanup:
liveins: $r12
$r0 = tMOVr killed $r12, 14, $noreg
tPOP_RET 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $pc, implicit killed $r0
$r0 = tMOVr killed $r12, 14 /* CC::al */, $noreg
frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $pc, implicit killed $r0
...
123 changes: 59 additions & 64 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -164,81 +164,75 @@ define dso_local i32 @b(i32* %c, i32 %d, i32 %e) "frame-pointer"="all" {
; CHECK-NEXT: push.w {r8, r9, r10, r11}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: cmp.w r1, #0
; CHECK-NEXT: beq .LBB2_3
; CHECK-NEXT: b .LBB2_1
; CHECK-NEXT: .LBB2_1: @ %while.body.preheader
; CHECK-NEXT: wls lr, r1, .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: mvn r2, #1
; CHECK-NEXT: mvn r3, #1
; CHECK-NEXT: @ implicit-def: $r9
; CHECK-NEXT: @ implicit-def: $r10
; CHECK-NEXT: @ implicit-def: $r6
; CHECK-NEXT: @ implicit-def: $r8
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
; CHECK-NEXT: @ implicit-def: $r4
; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov lr, r1
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: ldr.w r8, [r10]
; CHECK-NEXT: ldr r1, [r1, #-4]
; CHECK-NEXT: mul r11, r8, r0
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: mul r1, r1, r9
; CHECK-NEXT: adds.w r12, r1, #-2147483648
; CHECK-NEXT: asr.w r5, r1, #31
; CHECK-NEXT: ldr.w r1, [r10]
; CHECK-NEXT: add.w r1, r11, #-2147483648
; CHECK-NEXT: adc r5, r5, #0
; CHECK-NEXT: mul r11, r1, r0
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: add.w r3, r11, #-2147483648
; CHECK-NEXT: asrl r12, r5, r3
; CHECK-NEXT: smull r4, r3, r1, r12
; CHECK-NEXT: lsll r4, r3, #30
; CHECK-NEXT: asrs r5, r3, #31
; CHECK-NEXT: mov r4, r3
; CHECK-NEXT: lsll r4, r5, r1
; CHECK-NEXT: lsll r4, r5, #30
; CHECK-NEXT: ldrd r4, r11, [r2]
; CHECK-NEXT: asrs r3, r5, #31
; CHECK-NEXT: asrl r12, r5, r1
; CHECK-NEXT: smull r2, r1, r8, r12
; CHECK-NEXT: lsll r2, r1, #30
; CHECK-NEXT: asrs r5, r1, #31
; CHECK-NEXT: mov r2, r1
; CHECK-NEXT: lsll r2, r5, r8
; CHECK-NEXT: lsll r2, r5, #30
; CHECK-NEXT: ldrd r2, r11, [r3]
; CHECK-NEXT: asrs r1, r5, #31
; CHECK-NEXT: mov r12, r5
; CHECK-NEXT: ldr.w r5, [lr]
; CHECK-NEXT: muls r4, r6, r4
; CHECK-NEXT: mul r5, r5, r9
; CHECK-NEXT: asrs r5, r4, #31
; CHECK-NEXT: muls r2, r6, r2
; CHECK-NEXT: adds r2, #2
; CHECK-NEXT: lsll r12, r1, r2
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r12, #-2147483648
; CHECK-NEXT: ldr r2, [r2]
; CHECK-NEXT: mul r2, r2, r9
; CHECK-NEXT: add.w r9, r9, #4
; CHECK-NEXT: adds r4, #2
; CHECK-NEXT: lsll r12, r3, r4
; CHECK-NEXT: asr.w r4, r8, #31
; CHECK-NEXT: adds.w r3, r8, r5
; CHECK-NEXT: add.w r12, r12, #-2147483648
; CHECK-NEXT: adc.w r4, r4, r5, asr #31
; CHECK-NEXT: smull r5, r6, r11, r6
; CHECK-NEXT: adds.w r3, r3, #-2147483648
; CHECK-NEXT: adc r3, r4, #0
; CHECK-NEXT: asrs r4, r3, #31
; CHECK-NEXT: subs r5, r3, r5
; CHECK-NEXT: sbcs r4, r6
; CHECK-NEXT: adds.w r6, r5, #-2147483648
; CHECK-NEXT: adc r5, r4, #0
; CHECK-NEXT: asrl r6, r5, r12
; CHECK-NEXT: adds r4, r4, r2
; CHECK-NEXT: adc.w r2, r5, r2, asr #31
; CHECK-NEXT: adds.w r5, r4, #-2147483648
; CHECK-NEXT: smull r6, r4, r11, r6
; CHECK-NEXT: adc r2, r2, #0
; CHECK-NEXT: asrs r5, r2, #31
; CHECK-NEXT: subs r6, r2, r6
; CHECK-NEXT: sbcs r5, r4
; CHECK-NEXT: adds.w r6, r6, #-2147483648
; CHECK-NEXT: adc r5, r5, #0
; CHECK-NEXT: asrl r6, r5, r1
; CHECK-NEXT: movs r1, #2
; CHECK-NEXT: lsrl r6, r5, #2
; CHECK-NEXT: movs r5, #2
; CHECK-NEXT: str r6, [r5]
; CHECK-NEXT: ldr r5, [r2], #-4
; CHECK-NEXT: mls r1, r5, r1, r3
; CHECK-NEXT: adds.w r8, r1, #-2147483648
; CHECK-NEXT: asr.w r3, r1, #31
; CHECK-NEXT: adc r1, r3, #0
; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
; CHECK-NEXT: lsrl r8, r1, #2
; CHECK-NEXT: rsb.w r1, r8, #0
; CHECK-NEXT: str r6, [r1]
; CHECK-NEXT: ldr r1, [r3], #-4
; CHECK-NEXT: mls r1, r1, r8, r2
; CHECK-NEXT: adds.w r4, r1, #-2147483648
; CHECK-NEXT: asr.w r2, r1, #31
; CHECK-NEXT: adc r1, r2, #0
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT: lsrl r4, r1, #2
; CHECK-NEXT: rsbs r1, r4, #0
; CHECK-NEXT: str r1, [r10, #-4]
; CHECK-NEXT: add.w r10, r10, #4
; CHECK-NEXT: str r1, [r3]
; CHECK-NEXT: mov r1, lr
; CHECK-NEXT: add.w r1, lr, #4
; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: str.w lr, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: bne .LBB2_2
; CHECK-NEXT: b .LBB2_3
; CHECK-NEXT: str r1, [r2]
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: .LBB2_3: @ %while.end
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop.w {r8, r9, r10, r11}
Expand Down Expand Up @@ -328,20 +322,21 @@ define void @callinpreheader(i32* noalias nocapture readonly %pAngle, i32* nocap
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: subs r6, r2, #0
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: wls lr, r2, .LBB3_3
; CHECK-NEXT: mov.w r0, #0
; CHECK-NEXT: beq .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %for.body.ph
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: bl callee
; CHECK-NEXT: mov lr, r6
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r5], #4
; CHECK-NEXT: subs r6, #1
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: le lr, .LBB3_2
; CHECK-NEXT: cbz r6, .LBB3_3
; CHECK-NEXT: le .LBB3_2
; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup
; CHECK-NEXT: str r0, [r4]
; CHECK-NEXT: pop {r4, r5, r6, pc}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ body: |
successors: %bb.2(0x40000000), %bb.1(0x40000000)
$r0 = tLDRspi $sp, 7, 14, $noreg :: (load 4 from %stack.0)
t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r0, %bb.1, implicit-def dead $cpsr
tB %bb.2, 14, $noreg
...
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ body: |
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
t2WhileLoopStart $r2, %bb.3, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR $r2, %bb.3, implicit-def dead $cpsr
tB %bb.1, 14, $noreg
bb.1.while.body.preheader:
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ body: |
renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg
renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg
renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg
t2WhileLoopStart renamable $lr, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $lr, %bb.1, implicit-def dead $cpsr
tB %bb.3, 14, $noreg
bb.1.vector.ph:
Expand Down Expand Up @@ -345,7 +345,7 @@ body: |
renamable $r12 = t2BICri killed renamable $r12, 7, 14, $noreg, $noreg
renamable $r12 = t2SUBri killed renamable $r12, 8, 14, $noreg, $noreg
renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 27, 14, $noreg, $noreg
t2WhileLoopStart renamable $lr, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $lr, %bb.1, implicit-def dead $cpsr
tB %bb.2, 14, $noreg
bb.1.vector.body:
Expand Down Expand Up @@ -477,7 +477,7 @@ body: |
renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
renamable $r12 = t2MOVi 0, 14, $noreg, $noreg
t2WhileLoopStart renamable $lr, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR renamable $lr, %bb.1, implicit-def dead $cpsr
tB %bb.4, 14, $noreg
bb.1.vector.ph:
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/Thumb2/block-placement.mir
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ body: |
; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r0, %bb.1, implicit-def dead $cpsr
; CHECK: tB %bb.3, 14 /* CC::al */, $noreg
; CHECK: bb.1:
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
Expand All @@ -72,7 +72,7 @@ body: |
successors: %bb.3(0x80000000)
liveins: $r0, $r1, $r2
t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r0, %bb.1, implicit-def dead $cpsr
bb.3:
successors: %bb.3(0x7c000000), %bb.1(0x04000000)
Expand All @@ -97,7 +97,7 @@ body: |
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: t2WhileLoopStart killed renamable $r0, %bb.0, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r0, %bb.0, implicit-def dead $cpsr
; CHECK: bb.3:
; CHECK: successors: %bb.3(0x7c000000), %bb.1(0x04000000)
; CHECK: renamable $r0 = tLDRi renamable $r2, 0, 14 /* CC::al */, $noreg
Expand All @@ -119,7 +119,7 @@ body: |
successors: %bb.3(0x80000000)
liveins: $r0, $r1, $r2
t2WhileLoopStart killed renamable $r0, %bb.0, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r0, %bb.0, implicit-def dead $cpsr
bb.3:
successors: %bb.3(0x7c000000), %bb.1(0x04000000)
Expand All @@ -144,14 +144,14 @@ body: |
; CHECK: successors: %bb.3(0x80000000)
; CHECK: $lr = tMOVr $r0, 14 /* CC::al */, $noreg
; CHECK: renamable $r0 = t2ADDrs killed renamable $r2, killed $r0, 18, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2WhileLoopStart killed renamable $lr, %bb.1, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $lr, %bb.1, implicit-def dead $cpsr
; CHECK: tB %bb.3, 14 /* CC::al */, $noreg
; CHECK: bb.1:
; CHECK: successors: %bb.4(0x80000000)
; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: t2IT 11, 8, implicit-def $itstate
; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
; CHECK: t2WhileLoopStart killed renamable $r1, %bb.0, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r1, %bb.0, implicit-def dead $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: bb.3:
; CHECK: successors: %bb.3(0x7c000000), %bb.1(0x04000000)
Expand All @@ -160,7 +160,7 @@ body: |
; CHECK: bb.4:
; CHECK: successors: %bb.5(0x80000000)
; CHECK: renamable $r0 = t2ADDrs killed renamable $r3, renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2WhileLoopStart killed renamable $r1, %bb.6, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r1, %bb.6, implicit-def dead $cpsr
; CHECK: bb.5:
; CHECK: successors: %bb.5(0x7c000000), %bb.6(0x04000000)
; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.5, implicit-def dead $cpsr
Expand All @@ -182,7 +182,7 @@ body: |
tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
t2IT 11, 8, implicit-def $itstate
frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
t2WhileLoopStart killed renamable $r1, %bb.0, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r1, %bb.0, implicit-def dead $cpsr
t2B %bb.4, 14 /* CC::al */, $noreg
bb.1:
Expand All @@ -191,7 +191,7 @@ body: |
$lr = tMOVr $r0, 14 /* CC::al */, $noreg
renamable $r0 = t2ADDrs killed renamable $r2, killed $r0, 18, 14 /* CC::al */, $noreg, $noreg
t2WhileLoopStart killed renamable $lr, %bb.3, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $lr, %bb.3, implicit-def dead $cpsr
bb.2:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
Expand All @@ -205,7 +205,7 @@ body: |
liveins: $r1, $r3
renamable $r0 = t2ADDrs killed renamable $r3, renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg
t2WhileLoopStart killed renamable $r1, %bb.6, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r1, %bb.6, implicit-def dead $cpsr
bb.5:
successors: %bb.5(0x7c000000), %bb.6(0x04000000)
Expand All @@ -232,21 +232,21 @@ body: |
; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
; CHECK: t2IT 11, 8, implicit-def $itstate
; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
; CHECK: t2WhileLoopStart killed renamable $r1, %bb.2, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r1, %bb.2, implicit-def dead $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: $lr = tMOVr $r0, 14 /* CC::al */, $noreg
; CHECK: renamable $r0 = t2ADDrs killed renamable $r2, killed $r0, 18, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2WhileLoopStart killed renamable $lr, %bb.1, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $lr, %bb.1, implicit-def dead $cpsr
; CHECK: bb.3:
; CHECK: successors: %bb.3(0x7c000000), %bb.1(0x04000000)
; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.3, implicit-def dead $cpsr
; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
; CHECK: bb.4:
; CHECK: successors: %bb.5(0x80000000)
; CHECK: renamable $r0 = t2ADDrs killed renamable $r3, renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg
; CHECK: t2WhileLoopStart killed renamable $r1, %bb.6, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r1, %bb.6, implicit-def dead $cpsr
; CHECK: bb.5:
; CHECK: successors: %bb.5(0x7c000000), %bb.6(0x04000000)
; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.5, implicit-def dead $cpsr
Expand All @@ -268,7 +268,7 @@ body: |
tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
t2IT 11, 8, implicit-def $itstate
frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
t2WhileLoopStart killed renamable $r1, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r1, %bb.1, implicit-def dead $cpsr
t2B %bb.4, 14 /* CC::al */, $noreg
bb.1:
Expand All @@ -277,7 +277,7 @@ body: |
$lr = tMOVr $r0, 14 /* CC::al */, $noreg
renamable $r0 = t2ADDrs killed renamable $r2, killed $r0, 18, 14 /* CC::al */, $noreg, $noreg
t2WhileLoopStart killed renamable $lr, %bb.3, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $lr, %bb.3, implicit-def dead $cpsr
bb.2:
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
Expand All @@ -291,7 +291,7 @@ body: |
liveins: $r1, $r3
renamable $r0 = t2ADDrs killed renamable $r3, renamable $r1, 18, 14 /* CC::al */, $noreg, $noreg
t2WhileLoopStart killed renamable $r1, %bb.6, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r1, %bb.6, implicit-def dead $cpsr
bb.5:
successors: %bb.5(0x7c000000), %bb.6(0x04000000)
Expand All @@ -318,7 +318,7 @@ body: |
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
; CHECK: bb.2:
; CHECK: successors: %bb.3(0x80000000)
; CHECK: t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr
; CHECK: $lr = t2WhileLoopStartLR killed renamable $r0, %bb.1, implicit-def dead $cpsr
; CHECK: bb.3:
; CHECK: successors: %bb.3(0x7c000000), %bb.1(0x04000000)
; CHECK: renamable $r0 = tLDRi renamable $r2, 0, 14 /* CC::al */, $noreg
Expand All @@ -341,7 +341,7 @@ body: |
successors: %bb.3(0x80000000)
liveins: $r0, $r1, $r2
t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr
$lr = t2WhileLoopStartLR killed renamable $r0, %bb.1, implicit-def dead $cpsr
bb.3:
successors: %bb.3(0x7c000000), %bb.1(0x04000000)
Expand Down
157 changes: 78 additions & 79 deletions llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -785,96 +785,96 @@ define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonl
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: ldrh r5, [r0]
; CHECK-NEXT: ldr.w r9, [r0, #4]
; CHECK-NEXT: subs r6, r5, #1
; CHECK-NEXT: ldrh.w r9, [r0]
; CHECK-NEXT: ldr.w r10, [r0, #4]
; CHECK-NEXT: sub.w r6, r9, #1
; CHECK-NEXT: cmp r6, #3
; CHECK-NEXT: bhi .LBB15_6
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: ldr r7, [r0, #8]
; CHECK-NEXT: add.w r4, r9, r6, lsl #1
; CHECK-NEXT: lsr.w lr, r3, #2
; CHECK-NEXT: add.w r4, r10, r6, lsl #1
; CHECK-NEXT: lsrs r5, r3, #2
; CHECK-NEXT: ldrh.w r8, [r7, #6]
; CHECK-NEXT: ldrh.w r12, [r7, #4]
; CHECK-NEXT: ldrh r6, [r7, #2]
; CHECK-NEXT: ldrh r7, [r7]
; CHECK-NEXT: wls lr, lr, .LBB15_5
; CHECK-NEXT: wls lr, r5, .LBB15_5
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str.w r9, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r5, r3, #3
; CHECK-NEXT: add.w r10, r9, #2
; CHECK-NEXT: add.w r9, r10, #2
; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
; CHECK-NEXT: add.w r5, r2, r5, lsl #1
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: .LBB15_3: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #8
; CHECK-NEXT: sub.w r11, r10, #2
; CHECK-NEXT: add.w r5, r10, #2
; CHECK-NEXT: sub.w r11, r9, #2
; CHECK-NEXT: add.w r5, r9, #2
; CHECK-NEXT: vstrb.8 q0, [r4], #8
; CHECK-NEXT: vldrw.u32 q0, [r11]
; CHECK-NEXT: vldrw.u32 q1, [r10]
; CHECK-NEXT: vldrw.u32 q1, [r9]
; CHECK-NEXT: vmul.f16 q0, q0, r7
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vfma.f16 q0, q1, r12
; CHECK-NEXT: vldrw.u32 q1, [r10, #4]
; CHECK-NEXT: add.w r10, r10, #8
; CHECK-NEXT: vldrw.u32 q1, [r9, #4]
; CHECK-NEXT: add.w r9, r9, #8
; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: le lr, .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add.w r9, r9, r2, lsl #1
; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB15_5: @ %while.end
; CHECK-NEXT: and lr, r3, #3
; CHECK-NEXT: and r5, r3, #3
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vctp.16 lr
; CHECK-NEXT: vctp.16 r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r9]
; CHECK-NEXT: add.w r1, r9, #2
; CHECK-NEXT: vldrw.u32 q0, [r10]
; CHECK-NEXT: add.w r1, r10, #2
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: add.w r1, r9, #6
; CHECK-NEXT: add.w r1, r10, #6
; CHECK-NEXT: vmul.f16 q0, q0, r7
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r9, #4]
; CHECK-NEXT: vldrw.u32 q1, [r10, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r12
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r2]
; CHECK-NEXT: ldr.w r9, [r0, #4]
; CHECK-NEXT: ldr.w r10, [r0, #4]
; CHECK-NEXT: .LBB15_6: @ %if.end
; CHECK-NEXT: add.w r0, r9, r3, lsl #1
; CHECK-NEXT: lsr.w lr, r5, #2
; CHECK-NEXT: wls lr, lr, .LBB15_10
; CHECK-NEXT: add.w r0, r10, r3, lsl #1
; CHECK-NEXT: lsr.w r1, r9, #2
; CHECK-NEXT: wls lr, r1, .LBB15_10
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
; CHECK-NEXT: bic r2, r5, #3
; CHECK-NEXT: bic r2, r9, #3
; CHECK-NEXT: adds r1, r2, r3
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: add.w r1, r9, r1, lsl #1
; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: add.w r1, r10, r1, lsl #1
; CHECK-NEXT: .LBB15_8: @ %while.body51
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
; CHECK-NEXT: vstrb.8 q0, [r3], #8
; CHECK-NEXT: le lr, .LBB15_8
; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
; CHECK-NEXT: add.w r9, r9, r2, lsl #1
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: .LBB15_10: @ %while.end55
; CHECK-NEXT: ands r1, r5, #3
; CHECK-NEXT: ands r1, r9, #3
; CHECK-NEXT: beq .LBB15_12
; CHECK-NEXT: @ %bb.11: @ %if.then59
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vctp.16 r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r9]
; CHECK-NEXT: vstrht.16 q0, [r10]
; CHECK-NEXT: .LBB15_12: @ %if.end61
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
Expand Down Expand Up @@ -1052,36 +1052,36 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: cmp r3, #8
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %entry
; CHECK-NEXT: lsrs.w r12, r3, #2
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: ldrh r4, [r0]
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: movs r1, #1
; CHECK-NEXT: ldrd r5, r3, [r0, #4]
; CHECK-NEXT: sub.w r0, r4, #8
; CHECK-NEXT: and r8, r0, #7
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
; CHECK-NEXT: asr.w lr, r7, #3
; CHECK-NEXT: cmp.w lr, #1
; CHECK-NEXT: and r0, r0, #7
; CHECK-NEXT: asrs r6, r7, #3
; CHECK-NEXT: cmp r6, #1
; CHECK-NEXT: it gt
; CHECK-NEXT: asrgt r6, r7, #3
; CHECK-NEXT: asrgt r1, r7, #3
; CHECK-NEXT: add.w r7, r5, r4, lsl #1
; CHECK-NEXT: subs r7, #2
; CHECK-NEXT: str r7, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r1, r7, #2
; CHECK-NEXT: rsbs r7, r4, #0
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: add.w r7, r3, #16
; CHECK-NEXT: str r6, [sp] @ 4-byte Spill
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_3: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vstrb.8 q0, [r2], #8
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
Expand All @@ -1090,40 +1090,39 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: vldrw.u32 q0, [r1], #8
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldrh.w lr, [r3, #14]
; CHECK-NEXT: ldrh r0, [r3, #12]
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: ldrh r4, [r3, #10]
; CHECK-NEXT: ldrh r7, [r3, #8]
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
; CHECK-NEXT: ldrh.w r8, [r3, #12]
; CHECK-NEXT: ldrh r7, [r3, #10]
; CHECK-NEXT: ldrh r4, [r3, #8]
; CHECK-NEXT: ldrh r6, [r3, #6]
; CHECK-NEXT: ldrh.w r9, [r3, #4]
; CHECK-NEXT: ldrh.w r11, [r3, #2]
; CHECK-NEXT: ldrh.w r10, [r3]
; CHECK-NEXT: vstrb.8 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q0, [r5]
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: adds r1, r5, #2
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmul.f16 q0, q0, r10
; CHECK-NEXT: adds r1, r5, #6
; CHECK-NEXT: adds r0, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r11
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r9
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: add.w r1, r5, #10
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r7
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vfma.f16 q0, q1, r7
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r8
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_7
Expand All @@ -1137,58 +1136,59 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r0, [r6], #16
; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: adds r1, r5, #2
; CHECK-NEXT: adds r4, r5, #2
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: ldrh r0, [r6, #-14]
; CHECK-NEXT: adds r1, r5, #6
; CHECK-NEXT: adds r4, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-12]
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: ldrh r0, [r6, #-10]
; CHECK-NEXT: add.w r1, r5, #10
; CHECK-NEXT: add.w r4, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-8]
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: ldrh r0, [r6, #-6]
; CHECK-NEXT: ldrh r1, [r6, #-2]
; CHECK-NEXT: ldrh r4, [r6, #-2]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: ldrh r0, [r6, #-4]
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: adds r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r1
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_6
; CHECK-NEXT: b .LBB16_8
; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: cmp.w r8, #0
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: subs.w lr, r0, #0
; CHECK-NEXT: beq.w .LBB16_3
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: mov lr, r8
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r1, [r6], #2
; CHECK-NEXT: ldrh r4, [r6], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: subs.w lr, lr, #1
; CHECK-NEXT: vfma.f16 q0, q1, r1
; CHECK-NEXT: bne .LBB16_10
; CHECK-NEXT: b .LBB16_11
; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
; CHECK-NEXT: add.w r5, r5, r8, lsl #1
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r5, r5, r0, lsl #1
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_12: @ %if.end
; CHECK-NEXT: add sp, #24
Expand Down Expand Up @@ -1450,12 +1450,12 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
; CHECK-NEXT: .LBB17_3: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB17_5 Depth 2
; CHECK-NEXT: vldrh.u16 q4, [r6]
; CHECK-NEXT: vldrh.u16 q3, [r6, #4]
; CHECK-NEXT: vldrh.u16 q3, [r6]
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: vmov q5, q4
; CHECK-NEXT: vmov q6, q3
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vshlc q5, r5, #16
; CHECK-NEXT: vldrh.u16 q4, [r6, #4]
; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: vshlc q6, r5, #16
; CHECK-NEXT: vldrh.u16 q2, [r12]
; CHECK-NEXT: vmov.f32 s9, s1
Expand All @@ -1464,16 +1464,15 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
; CHECK-NEXT: @ %bb.4: @ %while.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: mov lr, r9
; CHECK-NEXT: .LBB17_5: @ %while.body
; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r7, [r1], #4
; CHECK-NEXT: vmov r4, s4
; CHECK-NEXT: vfma.f16 q2, q4, r7
; CHECK-NEXT: vfma.f16 q2, q3, r7
; CHECK-NEXT: ldrh r3, [r1, #-2]
; CHECK-NEXT: vmov.u16 r7, q2[0]
; CHECK-NEXT: vfma.f16 q2, q3, r7
; CHECK-NEXT: vfma.f16 q2, q4, r7
; CHECK-NEXT: vmov.16 q2[3], r4
; CHECK-NEXT: vfma.f16 q2, q5, r3
; CHECK-NEXT: vmov.u16 r3, q2[1]
Expand All @@ -1490,9 +1489,9 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
; CHECK-NEXT: @ %bb.7: @ %if.then
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: vfma.f16 q2, q4, r1
; CHECK-NEXT: vmov.u16 r1, q2[0]
; CHECK-NEXT: vfma.f16 q2, q3, r1
; CHECK-NEXT: vmov.u16 r1, q2[0]
; CHECK-NEXT: vfma.f16 q2, q4, r1
; CHECK-NEXT: strh r1, [r5]
; CHECK-NEXT: vmovx.f16 s6, s8
; CHECK-NEXT: vstr.16 s6, [r12]
Expand Down
Loading