diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index f2ad5ee249b46..b786f7d8565a4 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -171,7 +171,14 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { return (MI.getOpcode() == TargetOpcode::IMPLICIT_DEF && MI.getNumOperands() == 1) || (MI.getDesc().isRematerializable() && - isReallyTriviallyReMaterializable(MI)); + isReMaterializableImpl(MI, true)); + } + + bool isReMaterializable(const MachineInstr &MI) const { + return (MI.getOpcode() == TargetOpcode::IMPLICIT_DEF && + MI.getNumOperands() == 1) || + (MI.getDesc().isRematerializable() && + isReMaterializableImpl(MI, false)); } /// Given \p MO is a PhysReg use return if it can be ignored for the purpose @@ -198,7 +205,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// predicate must return false if the instruction has any side effects other /// than producing a value, or if it requres any address registers that are /// not always available. - virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const; + virtual bool isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const; /// This method commutes the operands of the given machine instruction MI. 
/// The operands to be commuted are specified by their indices OpIdx1 and diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index db00f54daeb62..2d84d6b385956 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1325,7 +1325,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (!TII->isAsCheapAsAMove(*DefMI)) return false; - if (!TII->isTriviallyReMaterializable(*DefMI)) + if (!TII->isReMaterializable(*DefMI)) return false; if (!definesFullReg(*DefMI, SrcReg)) diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 5be89b49fb6ba..da91bc186aac9 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -1590,8 +1590,8 @@ MachineTraceStrategy TargetInstrInfo::getMachineCombinerTraceStrategy() const { return MachineTraceStrategy::TS_MinInstrCount; } -bool TargetInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI) const { +bool TargetInstrInfo::isReMaterializableImpl( + const MachineInstr &MI, bool OnlyTrivial) const { const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1658,10 +1658,11 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializable( if (MO.isDef() && Reg != DefReg) return false; - // Don't allow any virtual-register uses. Rematting an instruction with - // virtual register uses would length the live ranges of the uses, which - // is not necessarily a good idea, certainly not "trivial". - if (MO.isUse()) + // If asked for trivial materialization, don't allow any virtual-register + // uses. Rematting an instruction with virtual register uses would lengthen + // the live ranges of the uses, which means rematerialization must become + // a per-user query which many callers don't want. 
+ if (OnlyTrivial && MO.isUse()) return false; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 13d05ee54d7b3..dc9e41aa097fb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -124,8 +124,8 @@ static bool canRemat(const MachineInstr &MI) { return false; } -bool SIInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI) const { +bool SIInstrInfo::isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const { if (canRemat(MI)) { // Normally VALU use of exec would block the rematerialization, but that @@ -139,13 +139,14 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable( // There is difference to generic method which does not allow // rematerialization if there are virtual register uses. We allow this, // therefore this method includes SOP instructions as well. + // FIXME: This should only be done if OnlyTrivial is setup. if (!MI.hasImplicitDef() && MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() && !MI.mayRaiseFPException()) return true; } - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI, OnlyTrivial); } // Returns true if the scalar result of a VALU instruction depends on exec. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e249fc6cbb79d..300aeaa31941e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -244,7 +244,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return ST; } - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const override; bool isIgnorableUse(const MachineOperand &MO) const override; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 5c35b3327c16d..9eb65533b682d 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6510,14 +6510,14 @@ bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault( return Subtarget.isMClass() && MF.getFunction().hasMinSize(); } -bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI) const { +bool ARMBaseInstrInfo::isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const { // Try hard to rematerialize any VCTPs because if we spill P0, it will block // the tail predication conversion. This means that the element count // register has to be live for longer, but that has to be better than // spill/restore and VPT predication. 
return (isVCTP(&MI) && !isPredicated(MI)) || - TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + TargetInstrInfo::isReMaterializableImpl(MI, OnlyTrivial); } unsigned llvm::getBLXOpcode(const MachineFunction &MF) { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 71de3c6ad597a..57c53036349a6 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -479,7 +479,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const override; private: /// Modeling special VFP / NEON fp MLA / MLS hazards. diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 20ccc622f58dc..9565a55e4c6c5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -943,7 +943,7 @@ let Predicates = [IsLA64] in { def ADD_D : ALU_3R<0x00108000>; def SUB_D : ALU_3R<0x00118000>; // ADDI_D isn't always rematerializable, but isReMaterializable will be used as -// a hint which is verified in isReMaterializableImpl. +// a hint which is verified in isReMaterializableImpl. // See LoongArchInstrInfo::isAsCheapAsAMove for more details. 
let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def ADDI_D : ALU_2RI12<0x02c00000, simm12_addlike>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 55e38bcf4afc9..4d436d6d5d017 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1075,8 +1075,8 @@ Register PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, // For opcodes with the ReMaterializable flag set, this function is called to // verify the instruction is really rematable. -bool PPCInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI) const { +bool PPCInstrInfo::isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const { switch (MI.getOpcode()) { default: // Let base implementaion decide. @@ -1112,7 +1112,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable( case PPC::DMXXSETACCZ: return true; } - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI, OnlyTrivial); } Register PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 63ebd65910572..61f580eb40344 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -530,7 +530,8 @@ class PPCInstrInfo : public PPCGenInstrInfo { unsigned &SubIdx) const override; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const override; Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 0ed97c61ec78a..dd3a163592f01 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -232,8 +232,8 @@ Register RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI, return 0; } -bool RISCVInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI) const { +bool RISCVInstrInfo::isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const { switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { case RISCV::VMV_V_X: case RISCV::VFMV_V_F: @@ -241,9 +241,11 @@ bool RISCVInstrInfo::isReallyTriviallyReMaterializable( case RISCV::VMV_S_X: case RISCV::VFMV_S_F: case RISCV::VID_V: + // FIXME: the v.x and v.f forms are not 'trivial' in the meaning + // of this API. Split them! return MI.getOperand(1).isUndef(); default: - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI, OnlyTrivial); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 57ec431749ebe..2d19a292da67c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -75,7 +75,8 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const override; bool shouldBreakCriticalEdgeToSink(MachineInstr &MI) const override { return MI.getOpcode() == RISCV::ADDI && MI.getOperand(1).isReg() && diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 47900cffa370c..9825733e29897 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -779,7 +779,7 @@ def SH : Store_rri<0b001, "sh">, Sched<[WriteSTH, ReadStoreData, ReadMemBase]>; def SW : Store_rri<0b010, "sw">, Sched<[WriteSTW, ReadStoreData, ReadMemBase]>; // ADDI isn't always rematerializable, but 
isReMaterializable will be used as -// a hint which is verified in isReallyTriviallyReMaterializable. +// a hint which is verified in isReMaterializableImpl. let isReMaterializable = 1, isAsCheapAsAMove = 1 in def ADDI : ALU_ri<0b000, "addi">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index feac04a17068a..88521fa5c06f5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -39,18 +39,18 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} -bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI) const { +bool WebAssemblyInstrInfo::isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const { switch (MI.getOpcode()) { case WebAssembly::CONST_I32: case WebAssembly::CONST_I64: case WebAssembly::CONST_F32: case WebAssembly::CONST_F64: - // TargetInstrInfo::isReallyTriviallyReMaterializable misses these + // TargetInstrInfo::isReMaterializableImpl misses these // because of the ARGUMENTS implicit def, so we manualy override it here. 
return true; default: - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI, OnlyTrivial); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index ba00097034bf5..aa11da3e3e388 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -37,7 +37,8 @@ class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 58d526269ff3c..4ccf7918b3741 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -755,8 +755,8 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { return isPICBase; } -bool X86InstrInfo::isReallyTriviallyReMaterializable( - const MachineInstr &MI) const { +bool X86InstrInfo::isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const { switch (MI.getOpcode()) { default: // This function should only be called for opcodes with the ReMaterializable @@ -951,7 +951,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable( break; } } - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI, OnlyTrivial); } void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 86133b3d969b1..1bd600c59cfc0 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ 
b/llvm/lib/Target/X86/X86InstrInfo.h @@ -340,7 +340,8 @@ class X86InstrInfo final : public X86GenInstrInfo { Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI, + bool OnlyTrivial) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 8655bb1292ef7..0d4c250c5df62 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -267,7 +267,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-SD-NEXT: and x9, x8, #0xfffffff0 ; CHECK-SD-NEXT: add x10, x2, #32 ; CHECK-SD-NEXT: add x11, x0, #16 -; CHECK-SD-NEXT: mov x12, x9 +; CHECK-SD-NEXT: and x12, x8, #0xfffffff0 ; CHECK-SD-NEXT: .LBB3_4: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16] @@ -313,7 +313,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: and x10, x9, #0xfffffff0 ; CHECK-GI-NEXT: add x11, x2, #32 ; CHECK-GI-NEXT: add x12, x0, #16 -; CHECK-GI-NEXT: mov x13, x10 +; CHECK-GI-NEXT: and x13, x9, #0xfffffff0 ; CHECK-GI-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NEXT: .LBB3_3: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 @@ -428,7 +428,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-SD-NEXT: and x9, x8, #0xfffffff0 ; CHECK-SD-NEXT: add x10, x2, #32 ; CHECK-SD-NEXT: add x11, x0, #16 -; CHECK-SD-NEXT: mov x12, x9 +; CHECK-SD-NEXT: and x12, x8, #0xfffffff0 ; CHECK-SD-NEXT: .LBB4_4: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop 
Header: Depth=1 ; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16] @@ -472,7 +472,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: and x8, x9, #0xfffffff0 ; CHECK-GI-NEXT: add x10, x2, #32 ; CHECK-GI-NEXT: add x11, x0, #16 -; CHECK-GI-NEXT: mov x12, x8 +; CHECK-GI-NEXT: and x12, x9, #0xfffffff0 ; CHECK-GI-NEXT: .LBB4_3: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: and w13, w1, #0xffff @@ -596,7 +596,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-SD-NEXT: and x11, x10, #0xfffffff0 ; CHECK-SD-NEXT: fmov s2, w9 ; CHECK-SD-NEXT: add x8, x0, #8 -; CHECK-SD-NEXT: mov x12, x11 +; CHECK-SD-NEXT: and x12, x10, #0xfffffff0 ; CHECK-SD-NEXT: .LBB5_5: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldp d3, d4, [x8, #-8] @@ -646,10 +646,10 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: add x10, x0, #8 +; CHECK-GI-NEXT: and x11, x8, #0xfffffff0 ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: dup v2.8h, w9 ; CHECK-GI-NEXT: and x9, x8, #0xfffffff0 -; CHECK-GI-NEXT: mov x11, x9 ; CHECK-GI-NEXT: .LBB5_5: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldp d3, d4, [x10, #-8] diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll index 4c8e589391c3a..c23e4e182f2ef 100644 --- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll +++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll @@ -17,7 +17,7 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef ; CHECK-NEXT: and x9, x8, #0xfffffff0 ; CHECK-NEXT: add x10, x1, #16 ; CHECK-NEXT: add x11, x0, #16 -; CHECK-NEXT: mov x12, x9 +; CHECK-NEXT: and x12, x8, #0xfffffff0 ; CHECK-NEXT: .LBB0_4: 
// %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp q1, q4, [x10, #-16] diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll index f6bbdf5d95d87..1770bb9794f09 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll @@ -14,7 +14,6 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) { ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: and x12, x10, #0xfffffff0 ; CHECK-NEXT: add x13, x1, #32 -; CHECK-NEXT: add x14, x2, #16 ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 @@ -27,52 +26,52 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) { ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_6 Depth 2 ; CHECK-NEXT: // Child Loop BB0_9 Depth 2 -; CHECK-NEXT: ldrsh w15, [x2, x9, lsl #1] +; CHECK-NEXT: ldrsh w14, [x2, x9, lsl #1] ; CHECK-NEXT: cmp w0, #16 ; CHECK-NEXT: b.hs .LBB0_5 ; CHECK-NEXT: // %bb.4: // in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: mov x18, xzr +; CHECK-NEXT: mov x17, xzr ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_5: // %vector.ph ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: dup v0.8h, w15 -; CHECK-NEXT: mov x16, x14 -; CHECK-NEXT: mov x17, x13 -; CHECK-NEXT: mov x18, x12 +; CHECK-NEXT: dup v0.8h, w14 +; CHECK-NEXT: add x15, x2, #16 +; CHECK-NEXT: mov x16, x13 +; CHECK-NEXT: and x17, x10, #0xfffffff0 ; CHECK-NEXT: .LBB0_6: // %vector.body ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldp q1, q4, [x16, #-16] -; CHECK-NEXT: subs x18, x18, #16 -; CHECK-NEXT: ldp q3, q2, [x17, #-32] -; CHECK-NEXT: add x16, x16, #32 -; CHECK-NEXT: ldp q6, q5, [x17] +; CHECK-NEXT: ldp q1, q4, [x15, #-16] +; CHECK-NEXT: subs x17, x17, #16 +; CHECK-NEXT: ldp q3, q2, [x16, #-32] +; CHECK-NEXT: add x15, x15, #32 +; 
CHECK-NEXT: ldp q6, q5, [x16] ; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h ; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h ; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h ; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h -; CHECK-NEXT: stp q3, q2, [x17, #-32] -; CHECK-NEXT: stp q6, q5, [x17], #64 +; CHECK-NEXT: stp q3, q2, [x16, #-32] +; CHECK-NEXT: stp q6, q5, [x16], #64 ; CHECK-NEXT: b.ne .LBB0_6 ; CHECK-NEXT: // %bb.7: // %middle.block ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: cmp x12, x10 -; CHECK-NEXT: mov x18, x12 +; CHECK-NEXT: and x17, x10, #0xfffffff0 ; CHECK-NEXT: b.eq .LBB0_2 ; CHECK-NEXT: .LBB0_8: // %for.body4.us.preheader ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: add x16, x18, x8 -; CHECK-NEXT: add x17, x2, x18, lsl #1 -; CHECK-NEXT: sub x18, x10, x18 -; CHECK-NEXT: add x16, x1, x16, lsl #2 +; CHECK-NEXT: add x15, x17, x8 +; CHECK-NEXT: add x16, x2, x17, lsl #1 +; CHECK-NEXT: sub x17, x10, x17 +; CHECK-NEXT: add x15, x1, x15, lsl #2 ; CHECK-NEXT: .LBB0_9: // %for.body4.us ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrsh w3, [x17], #2 -; CHECK-NEXT: ldr w4, [x16] -; CHECK-NEXT: subs x18, x18, #1 -; CHECK-NEXT: madd w3, w3, w15, w4 -; CHECK-NEXT: str w3, [x16], #4 +; CHECK-NEXT: ldrsh w18, [x16], #2 +; CHECK-NEXT: ldr w3, [x15] +; CHECK-NEXT: subs x17, x17, #1 +; CHECK-NEXT: madd w18, w18, w14, w3 +; CHECK-NEXT: str w18, [x15], #4 ; CHECK-NEXT: b.ne .LBB0_9 ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll index 3caac1d13495d..74b0e69d1b05b 100644 --- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll +++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll @@ -278,9 +278,9 @@ define i64 @test_and_4(i64 %x, i64 %y) { ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NEXT: and x20, x0, 
#0x3 ; CHECK-GI-NEXT: mov x19, x0 -; CHECK-GI-NEXT: mov x0, x20 +; CHECK-GI-NEXT: and x20, x0, #0x3 +; CHECK-GI-NEXT: and x0, x0, #0x3 ; CHECK-GI-NEXT: bl callee ; CHECK-GI-NEXT: tst x19, #0x3 ; CHECK-GI-NEXT: csel x0, x20, x0, eq diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index 5fc996ad921ff..0f629971b5844 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -23,7 +23,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: add x13, x1, #16 ; CHECK-NEXT: add x8, x1, x10, lsl #2 ; CHECK-NEXT: add x9, x0, x10 -; CHECK-NEXT: mov x14, x10 +; CHECK-NEXT: and x14, x11, #0x1fffffff8 ; CHECK-NEXT: .LBB0_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp q1, q2, [x13, #-16] @@ -194,9 +194,9 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000 ; CHECK-NEXT: and x10, x11, #0x1fffffffc ; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: and x12, x11, #0x1fffffffc ; CHECK-NEXT: add x8, x1, x10, lsl #3 ; CHECK-NEXT: add x9, x0, x10, lsl #1 -; CHECK-NEXT: mov x12, x10 ; CHECK-NEXT: .LBB1_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld2 { v1.4s, v2.4s }, [x1], #32 @@ -341,7 +341,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI2_0] ; CHECK-NEXT: add x9, x10, x10, lsl #1 -; CHECK-NEXT: mov x12, x10 +; CHECK-NEXT: and x12, x11, #0x1fffffffc ; CHECK-NEXT: add x8, x1, x9, lsl #2 ; CHECK-NEXT: add x9, x0, x9 ; CHECK-NEXT: .LBB2_4: // %vector.body @@ -597,7 +597,7 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI3_0] ; CHECK-NEXT: add x8, x1, x10, lsl #4 ; CHECK-NEXT: add x9, x0, x10, lsl #2 -; CHECK-NEXT: mov x12, x10 +; 
CHECK-NEXT: and x12, x11, #0x1fffffffc ; CHECK-NEXT: .LBB3_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64 diff --git a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll index 97980770cc7c9..e3ed31fae70c0 100644 --- a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll +++ b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll @@ -163,9 +163,9 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; LINUX-NEXT: ld %f10, 8(%r3) ; LINUX-NEXT: ld %f0, 16(%r3) ; LINUX-NEXT: ld %f2, 24(%r3) -; LINUX-NEXT: la %r3, 16(%r2) -; LINUX-NEXT: la %r4, 48(%r2) ; LINUX-NEXT: la %r2, 176(%r15) +; LINUX-NEXT: la %r3, 16(%r13) +; LINUX-NEXT: la %r4, 48(%r13) ; LINUX-NEXT: std %f0, 176(%r15) ; LINUX-NEXT: std %f2, 184(%r15) ; LINUX-NEXT: brasl %r14, sincosl@PLT diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 6f986ce28381b..c418038b751d7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -541,11 +541,11 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocaptur ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: add.w r12, r4, r3, lsr #3 ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll index e0a61b1f9d956..78dc35b283d18 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll +++ 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll @@ -49,18 +49,17 @@ define i32 @vcmp_new_vpst_combination(i32 %len, ptr nocapture readonly %arr) { ; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: blt .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmov.i32 q1, #0x1 +; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.32 lr, r0 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q2, q1 -; CHECK-NEXT: vaddva.u32 r2, q2 +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vaddva.u32 r2, q1 ; CHECK-NEXT: letp lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll index dad856c0677a1..5746c7950c6ec 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -38,7 +38,7 @@ define arm_aapcs_vfpcc void @k() { ; CHECK-NEXT: vmov.i32 q5, #0x0 ; CHECK-NEXT: vpsel q6, q4, q3 ; CHECK-NEXT: vstrh.16 q6, [r0] -; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.i32 q6, #0x0 ; CHECK-NEXT: cbz r1, .LBB0_2 ; CHECK-NEXT: le .LBB0_1 ; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index f90af3cc5ba24..9ed0ff6c10e72 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -121,11 +121,11 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 ; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q0, #0x0 ; 
CHECK-NEXT: add.w r10, r4, #1 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 @@ -257,16 +257,16 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 ; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: adds r0, r5, #2 ; CHECK-NEXT: adds r2, r5, #1 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: mov r4, r10 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB2_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 @@ -427,17 +427,17 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #2 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #1 ; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r5, r11 ; CHECK-NEXT: mov r4, r9 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB3_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 @@ -617,18 +617,18 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: 
ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: add.w r10, r0, #2 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: add.w r10, r0, #2 ; CHECK-NEXT: add.w r11, r0, #1 ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q4, #0x0 ; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 @@ -830,17 +830,17 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r0, #2 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 @@ -1065,19 +1065,20 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #24] 
@ 4-byte Spill -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: add.w r8, r0, #1 ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q6, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #56] @ 16-byte Spill ; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 @@ -1346,21 +1347,23 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: adds r4, r0, #3 ; CHECK-NEXT: add.w r8, r0, #2 ; CHECK-NEXT: adds r1, r0, #1 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov q6, q3 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: vmov q7, q3 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov.i32 q6, #0x0 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q7, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: mov r10, r7 -; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #72] @ 16-byte 
Spill ; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 @@ -1379,9 +1382,9 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q7, q1, q0 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q1, [r5] ; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload @@ -1397,13 +1400,13 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vldrwt.u32 q1, [r5] ; CHECK-NEXT: adds r7, r5, r6 ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r7] +; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: adds r5, r7, r6 -; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q4, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r5] diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll index 29c4fb902bf36..413c4a14a2593 100644 --- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll @@ -1496,15 +1496,14 @@ define void @vfmasq(ptr %x, ptr %y, i32 %n) { ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB34_1: @ %for.body.preheader -; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB34_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vstrw.32 q3, [r1], #16 +; CHECK-NEXT: vmov.f32 q2, #1.000000e+01 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 
q1, [r0], #16 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r1], #16 ; CHECK-NEXT: letp lr, .LBB34_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1542,15 +1541,14 @@ define void @vfmas(ptr %s1, ptr %s2, i32 %N) { ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph -; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB35_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vstrw.32 q3, [r0], #16 +; CHECK-NEXT: vmov.f32 q2, #1.000000e+01 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r0], #16 ; CHECK-NEXT: letp lr, .LBB35_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc}